update

e019635f · xuxzh1 · 64def8e2 · 64def8e2 · 64def8e2 · 64def8e2
Commit e019635f authored Nov 01, 2024 by xuxzh1 🎱
20 changed files
--- a/server/vllm/vllm/model_executor/input_metadata.py
+++ b/server/vllm/vllm/model_executor/input_metadata.py
-from typing import Dict, List, Optional, Tuple
-
-import torch
-from xformers.ops import AttentionBias
-
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SequenceData
-
-
-class InputMetadata:
-    """Metadata for input sequences. Used for PagedAttention.
-
-    Args:
-        seq_groups: List of (seq_ids, sampling_params).
-        seq_data: Seq_id -> SequenceData.
-        prompt_lens: Lengths of prompts.
-        slot_mapping: The address to write the new KV to of each token.
-        context_lens: the length of attention context for each generation token.
-        max_context_len: The maximum context length.
-        block_tables: The block tables. (Seq id -> list of physical block)
-    """
-
-    def __init__(
-        self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],
-        seq_data: Dict[int, SequenceData],
-        prompt_lens: List[int],
-        slot_mapping: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        block_tables: torch.Tensor,
-        sliding_window: Optional[int] = None,
-    ) -> None:
-        self.seq_groups = seq_groups
-        self.seq_data = seq_data
-        self.prompt_lens = prompt_lens
-        self.slot_mapping = slot_mapping
-        self.context_lens = context_lens
-        self.max_context_len = max_context_len
-        self.block_tables = block_tables
-
-        self.max_prompt_len = max(prompt_lens) if prompt_lens else 0
-        self.to_cache = None
-        if sliding_window is not None:
-            # We need to keep the positions of sliding windows within
-            # the key / value tables, this is helpful to know which
-            # elements we need to cache.
-            to_cache, start_idx = [], 0
-            for prompt_len in self.prompt_lens:
-                to_cache.extend(
-                    range(
-                        start_idx + max(0, prompt_len - sliding_window),
-                        start_idx + prompt_len,
-                    ))
-                start_idx += self.max_prompt_len
-            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
-            self.to_cache = torch.tensor(to_cache,
-                                         dtype=torch.int32,
-                                         device=self.slot_mapping.device)
-
-        self.num_prompts = len(prompt_lens)
-        self.num_prompt_tokens = self.num_prompts * self.max_prompt_len
-        self.num_generation_tokens = context_lens.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens
-        assert context_lens.shape[0] == self.num_generation_tokens
-
-        # Set during the execution of the first attention op.
-        self.attn_bias: Optional[AttentionBias] = None
-
-    def __repr__(self) -> str:
-        # Print only useful metadata.
-        return (f'InputMetadata('
-                f'num_prompt_tokens={self.num_prompt_tokens}, '
-                f'num_prompts={self.num_prompts}, '
-                f'prompt_lens={self.prompt_lens}, '
-                f'num_generation_tokens={self.num_generation_tokens}, '
-                f'context_lens={self.context_lens}, '
-                f'max_context_len={self.max_context_len}), '
-                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-                f'block_tables={self.block_tables}), '
-                f'slot_mapping={self.slot_mapping}')
--- a/server/vllm/vllm/model_executor/layers/__init__.py
+++ b/server/vllm/vllm/model_executor/layers/__init__.py
--- a/server/vllm/vllm/model_executor/layers/activation.py
+++ b/server/vllm/vllm/model_executor/layers/activation.py
-"""Custom activation functions."""
-import torch
-import torch.nn as nn
-
-from vllm import activation_ops
-
-
-class SiluAndMul(nn.Module):
-    """An activation function for SwiGLU.
-
-    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
-
-    Shapes:
-        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
-        return: (batch_size, seq_len, d) or (num_tokens, d)
-    """
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
-        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        activation_ops.silu_and_mul(out, x)
-        return out
-
-
-class NewGELU(nn.Module):
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        out = torch.empty_like(x)
-        activation_ops.gelu_new(out, x)
-        return out
-
-
-class FastGELU(nn.Module):
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        out = torch.empty_like(x)
-        activation_ops.gelu_fast(out, x)
-        return out
-
-
-_ACTIVATION_REGISTRY = {
-    "gelu": nn.GELU(),
-    "gelu_fast": FastGELU(),
-    "gelu_new": NewGELU(),
-    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
-    "relu": nn.ReLU(),
-}
-
-
-def get_act_fn(act_fn: str) -> nn.Module:
-    """Get an activation function by name."""
-    act_fn = act_fn.lower()
-    if act_fn in _ACTIVATION_REGISTRY:
-        return _ACTIVATION_REGISTRY[act_fn]
-    raise ValueError(f"Activation function {act_fn!r} is not supported.")
--- a/server/vllm/vllm/model_executor/layers/attention.py
+++ b/server/vllm/vllm/model_executor/layers/attention.py
-"""Multi-head attention."""
-from typing import Any, Dict, List, Optional
-
-import torch
-import torch.nn as nn
-from xformers import ops as xops
-from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
-                                         LowerTriangularMaskWithTensorBias)
-
-from vllm import attention_ops
-from vllm import cache_ops
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.rotary_embedding import (
-    DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding,
-    RotaryEmbedding)
-
-_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
-# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
-_PARTITION_SIZE = 512
-
-
-class PagedAttention(nn.Module):
-    # pylint: disable=line-too-long
-    """GPT-style multi-head PagedAttention.
-
-    This class takes query, key, and value tensors as input. The input tensors
-    can either contain prompt tokens or generation tokens, in addition to
-    paddings.
-
-    The class does the following:
-    1. Perform multi_query_kv_attention for the prompts. This operation does
-        not use the KV cache.
-    2. Wait for the cache operations (e.g., swap, copy) to finish. The cache
-        operations are issued by the cache engine before executing the forward
-        pass of the model, and they are executed asynchronously.
-    3. Reshape and store the input key and value tensors in the KV cache.
-    4. Perform single_query_cached_kv_attention for the generation tokens.
-        This operation reads the previous key and value tensors from the KV
-        cache.
-    5. Return the output tensor.
-    """
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 num_kv_heads: Optional[int] = None,
-                 sliding_window: Optional[int] = None) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
-        self.sliding_window = sliding_window
-
-        assert self.num_heads % self.num_kv_heads == 0
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.head_mapping = torch.repeat_interleave(
-            torch.arange(self.num_kv_heads, dtype=torch.int32, device="cuda"),
-            self.num_queries_per_kv)
-
-        if self.head_size not in _SUPPORTED_HEAD_SIZES:
-            raise ValueError(f"head_size ({self.head_size}) is not supported. "
-                             f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")
-
-    def set_attn_bias(
-        self,
-        input_metadata: InputMetadata,
-        dtype: torch.dtype,
-    ) -> None:
-        del dtype  # Unused.
-        if input_metadata.attn_bias is not None:
-            # Already set by a previous layer.
-            return
-        prompt_lens = [input_metadata.max_prompt_len
-                       ] * input_metadata.num_prompts
-        attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
-        if self.sliding_window is not None:
-            attn_bias = attn_bias.make_local_attention(self.sliding_window)
-        input_metadata.attn_bias = attn_bias
-
-    def multi_query_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        """Normal attention for the prompt tokens.
-
-        Args:
-            output: shape = [num_prompt_tokens, num_heads, head_size]
-            query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            input_metadata: metadata for paged attention.
-        """
-        if self.num_kv_heads != self.num_heads:
-            # Project the key and value tensors to the desired number of heads.
-            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
-            value = torch.repeat_interleave(value,
-                                            self.num_queries_per_kv,
-                                            dim=1)
-
-        # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
-        out = xops.memory_efficient_attention_forward(
-            query.unsqueeze(0),
-            key.unsqueeze(0),
-            value.unsqueeze(0),
-            attn_bias=input_metadata.attn_bias,
-            p=0.0,
-            scale=self.scale,
-        )
-        # TODO(woosuk): Unnecessary copy. Optimize.
-        output.copy_(out.squeeze(0))
-        return output
-
-    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
-        """Returns the slopes for the alibi attention bias.
-
-        Returns:
-            slopes: shape = [num_heads]
-        """
-        return None
-
-    def single_query_cached_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        input_metadata: InputMetadata,
-        alibi_slopes: Optional[torch.Tensor],
-    ) -> None:
-        """PagedAttention for the generation tokens.
-
-        Args:
-            output: shape = [num_generation_tokens, num_heads, head_size]
-            query: shape = [num_generation_tokens, num_heads, head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for paged attention.
-            alibi_slopes: shape = [num_heads]
-        """
-        block_size = value_cache.shape[3]
-        num_seqs, num_heads, head_size = query.shape
-        max_num_partitions = (
-            (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
-            _PARTITION_SIZE)
-        # NOTE(woosuk): We use a simple heuristic to decide whether to use
-        # PagedAttention V1 or V2. If the number of partitions is 1, we use
-        # V1 to avoid the overhead of reduction. Also, if the number of
-        # sequences or heads is large, we use V1 since there is enough work
-        # to parallelize.
-        # TODO(woosuk): Tune this heuristic.
-        use_v1 = max_num_partitions == 1 or num_seqs * num_heads > 512
-        if use_v1:
-            # Run PagedAttention V1.
-            attention_ops.paged_attention_v1(
-                output,
-                query,
-                key_cache,
-                value_cache,
-                self.head_mapping,
-                self.scale,
-                input_metadata.block_tables,
-                input_metadata.context_lens,
-                block_size,
-                input_metadata.max_context_len,
-                alibi_slopes,
-            )
-        else:
-            # Run PagedAttention V2.
-            assert _PARTITION_SIZE % block_size == 0
-            tmp_output = torch.empty(
-                size=(num_seqs, num_heads, max_num_partitions, head_size),
-                dtype=output.dtype,
-                device=output.device,
-            )
-            exp_sums = torch.empty(
-                size=(num_seqs, num_heads, max_num_partitions),
-                dtype=torch.float32,
-                device=output.device,
-            )
-            max_logits = torch.empty_like(exp_sums)
-            attention_ops.paged_attention_v2(
-                output,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                self.head_mapping,
-                self.scale,
-                input_metadata.block_tables,
-                input_metadata.context_lens,
-                block_size,
-                input_metadata.max_context_len,
-                alibi_slopes,
-            )
-
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: Optional[torch.Tensor],
-        value_cache: Optional[torch.Tensor],
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """PagedAttention forward pass.
-
-        NOTE: The query, key, and value tensors must be sliced from a qkv
-        tensor of shape [batch_size, seq_len, 3 * num_heads * head_size].
-
-        Args:
-            query: shape = [batch_size, seq_len, num_heads * head_size]
-            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            value: shape = [batch_size, num_kv_heads * head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for paged attention.
-            cache_event: event to wait for the cache operations to finish.
-
-        Returns:
-            shape = [batch_size, seq_len, num_heads * head_size]
-        """
-        batch_size, seq_len, _ = query.shape
-        # Reshape the query, key, and value tensors.
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-
-        # Pre-allocate the output tensor.
-        output = torch.empty_like(query)
-
-        # Compute the attention op for prompts.
-        num_prompt_tokens = input_metadata.num_prompt_tokens
-        if num_prompt_tokens > 0:
-            # Prompt run.
-            assert input_metadata.num_generation_tokens == 0
-            self.set_attn_bias(input_metadata, dtype=query.dtype)
-            self.multi_query_kv_attention(
-                output,
-                query,
-                key,
-                value,
-                input_metadata,
-            )
-
-        # Wait until the cache op is done.
-        if cache_event is not None:
-            cache_event.wait()
-
-        # Reshape the keys and values and store them in the cache.
-        # When key_cache and value_cache are not provided, the new key
-        # and value vectors will not be cached.
-        if key_cache is not None and value_cache is not None:
-            key_to_cache = key
-            value_to_cache = value
-            slot_mapping = input_metadata.slot_mapping.view(-1)
-            if input_metadata.to_cache is not None:
-                key_to_cache = key_to_cache[input_metadata.to_cache]
-                value_to_cache = value_to_cache[input_metadata.to_cache]
-                slot_mapping = slot_mapping[input_metadata.to_cache]
-
-            cache_ops.reshape_and_cache(
-                key_to_cache,
-                value_to_cache,
-                key_cache,
-                value_cache,
-                slot_mapping,
-            )
-
-        if input_metadata.num_generation_tokens > 0:
-            # Decoding run.
-            assert input_metadata.num_prompt_tokens == 0
-            assert key_cache is not None and value_cache is not None, (
-                "key_cache and value_cache must be provided when "
-                "generating tokens.")
-            # Compute the attention op for generation tokens.
-            self.single_query_cached_kv_attention(output, query, key_cache,
-                                                  value_cache, input_metadata,
-                                                  self.get_alibi_slopes())
-
-        # Reshape the output tensor.
-        # NOTE(woosuk): The output tensor may include paddings.
-        return output.view(batch_size, seq_len,
-                           self.num_heads * self.head_size)
-
-
-class PagedAttentionWithRoPE(PagedAttention):
-    """PagedAttention with rotary positional embedding."""
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        rotary_dim: int,
-        max_position: int = 8192,
-        base: int = 10000,
-        num_kv_heads: Optional[int] = None,
-        is_neox_style: bool = True,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        sliding_window: Optional[int] = None,
-    ) -> None:
-        super().__init__(num_heads,
-                         head_size,
-                         scale,
-                         num_kv_heads,
-                         sliding_window=sliding_window)
-        if rope_scaling is None:
-            self.rotary_emb = RotaryEmbedding(head_size, rotary_dim,
-                                              max_position, base,
-                                              is_neox_style)
-        else:
-            scaling_type = rope_scaling["type"]
-            scaling_factor = rope_scaling["factor"]
-            if scaling_type == "linear":
-                self.rotary_emb = LinearScalingRotaryEmbedding(
-                    head_size, rotary_dim, max_position, base, is_neox_style,
-                    scaling_factor)
-            elif scaling_type == "dynamic":
-                self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
-                    head_size, rotary_dim, max_position, base, is_neox_style,
-                    scaling_factor)
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """ PagedAttention forward pass with rotary embedding.
-
-        Args:
-            positions: shape = [batch_size, seq_len]
-            query: shape = [batch_size, seq_len, num_heads * head_size]
-            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for paged attention.
-            cache_event: event to wait for the cache operations to finish.
-
-        Returns:
-            shape = [batch_size, seq_len, num_heads * head_size]
-        """
-
-        # Apply rotary embedding to the query and key before passing them
-        # to the attention op.
-        query, key = self.rotary_emb(positions, query, key)
-        return super().forward(
-            query,
-            key,
-            value,
-            key_cache,
-            value_cache,
-            input_metadata,
-            cache_event,
-        )
-
-
-class PagedAttentionWithALiBi(PagedAttention):
-    """PagedAttention with ALiBi attention bias."""
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 slopes: List[float],
-                 num_kv_heads: Optional[int] = None) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads)
-        assert len(slopes) == num_heads
-
-        slopes = torch.tensor(slopes, dtype=torch.float32)
-        self.register_buffer("alibi_slopes", slopes, persistent=False)
-
-    def set_attn_bias(self, input_metadata: InputMetadata,
-                      dtype: torch.dtype) -> None:
-        if input_metadata.attn_bias is not None:
-            # Already set by a previous layer.
-            return
-        # Generates ALiBi mask based on the max prompt length.
-        max_prompt_len = input_metadata.max_prompt_len
-        bias = torch.arange(max_prompt_len, dtype=dtype)
-        # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(prompt_len, 1)`
-        # here. We find that both biases give the same results, but
-        # the bias below more accurately follows the original ALiBi
-        # paper.
-        bias = bias[None, :] - bias[:, None]
-        bias = bias.to(self.alibi_slopes.device)
-
-        # When using custom attention bias, xformers requires the bias to
-        # be sliced from a tensor whose length is a multiple of 8.
-        padded_len = (max_prompt_len + 7) // 8 * 8
-        bias = torch.empty(
-            input_metadata.num_prompts,
-            self.num_heads,
-            max_prompt_len,
-            padded_len,
-            device=self.alibi_slopes.device,
-            dtype=dtype,
-        )[:, :, :, :max_prompt_len].copy_(bias)
-        bias.mul_(self.alibi_slopes[:, None, None])
-        attn_bias = LowerTriangularMaskWithTensorBias(bias)
-        input_metadata.attn_bias = attn_bias
-
-    def multi_query_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        """Attention with ALiBi bias for the prompt tokens.
-
-        Args:
-            output: shape = [num_prompt_tokens, num_heads, head_size]
-            query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            input_metadata: metadata for paged attention.
-        """
-        if self.num_kv_heads != self.num_heads:
-            # Project the key and value tensors to the desired number of heads.
-            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
-            value = torch.repeat_interleave(value,
-                                            self.num_queries_per_kv,
-                                            dim=1)
-        batch_size = input_metadata.num_prompts
-        seq_len = input_metadata.max_prompt_len
-
-        out = xops.memory_efficient_attention_forward(
-            query.view(batch_size, seq_len, self.num_heads, self.head_size),
-            key.view(batch_size, seq_len, self.num_heads, self.head_size),
-            value.view(batch_size, seq_len, self.num_heads, self.head_size),
-            attn_bias=input_metadata.attn_bias,
-            p=0.0,
-            scale=self.scale,
-        )
-        # TODO(woosuk): Unnecessary copy. Optimize.
-        output.copy_(out.view(-1, self.num_heads, self.head_size))
-        return output
-
-    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
-        return self.alibi_slopes
--- a/server/vllm/vllm/model_executor/layers/layernorm.py
+++ b/server/vllm/vllm/model_executor/layers/layernorm.py
-"""Custom normalization layers."""
-import torch
-import torch.nn as nn
-
-from vllm import layernorm_ops
-
-
-class RMSNorm(nn.Module):
-    """Root mean square normalization.
-
-    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
-    Refer to https://arxiv.org/abs/1910.07467
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        out = torch.empty_like(x)
-        layernorm_ops.rms_norm(
-            out,
-            x,
-            self.weight.data,
-            self.variance_epsilon,
-        )
-        return out
--- a/server/vllm/vllm/model_executor/layers/quantized_linear/__init__.py
+++ b/server/vllm/vllm/model_executor/layers/quantized_linear/__init__.py
-from vllm.model_executor.layers.quantized_linear.awq import (
-    AWQColumnParallelLinear, AWQRowParallelLinear)
-from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
-                                                       RowParallelLinear)
-
-_QUANTIZED_LINEAR_REGISTRY = {
-    "awq": (AWQColumnParallelLinear, AWQRowParallelLinear),
-}
-
-
-class ParallelLinear:
-
-    @classmethod
-    def column(cls, *args, **kwargs) -> ColumnParallelLinear:
-        quant_config = kwargs.get("quant_config", None)
-        if quant_config is None:
-            return ColumnParallelLinear(*args, **kwargs)
-
-        name = quant_config.get_name()
-        if name not in _QUANTIZED_LINEAR_REGISTRY:
-            raise ValueError(f"No quantized linear is found for {name}")
-
-        quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][0]
-        return quant_linear_cls(*args, **kwargs)
-
-    @classmethod
-    def row(cls, *args, **kwargs) -> RowParallelLinear:
-        quant_config = kwargs.get("quant_config", None)
-        if quant_config is None:
-            return RowParallelLinear(*args, **kwargs)
-
-        name = quant_config.get_name()
-        if name not in _QUANTIZED_LINEAR_REGISTRY:
-            raise ValueError(f"No quantized linear is found for {name}")
-
-        quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][1]
-        return quant_linear_cls(*args, **kwargs)
--- a/server/vllm/vllm/model_executor/layers/quantized_linear/awq.py
+++ b/server/vllm/vllm/model_executor/layers/quantized_linear/awq.py
-from typing import Optional
-
-import torch
-from torch.nn.parameter import Parameter
-
-from vllm import quantization_ops
-from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
-                                                       RowParallelLinear)
-
-
-class AWQColumnParallelLinear(ColumnParallelLinear):
-
-    def create_weights(self, dtype: torch.dtype) -> None:
-        assert self.input_size % self.quant_config.weight_bits == 0
-        assert (self.output_size_per_partition %
-                self.quant_config.pack_factor == 0)
-        self.qweight = Parameter(
-            torch.empty(
-                self.input_size,
-                self.output_size_per_partition //
-                self.quant_config.pack_factor,
-                device="cuda",
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        self.qzeros = Parameter(
-            torch.empty(
-                self.input_size // self.quant_config.group_size,
-                self.output_size_per_partition //
-                self.quant_config.pack_factor,
-                device="cuda",
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        self.scales = Parameter(
-            torch.empty(
-                self.input_size // self.quant_config.group_size,
-                self.output_size_per_partition,
-                device="cuda",
-                dtype=dtype,
-            ),
-            requires_grad=False,
-        )
-
-    def apply_weights(
-        self,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        pack_factor = self.quant_config.pack_factor
-        out_shape = (x.shape[:-1] + (self.qweight.shape[-1] * pack_factor, ))
-        reshaped_x = x.reshape(-1, x.shape[-1])
-        out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales,
-                                        self.qzeros, pack_factor)
-        if bias is not None:
-            out = out + bias
-        return out.reshape(out_shape)
-
-
-class AWQRowParallelLinear(RowParallelLinear):
-
-    def create_weights(self, dtype: torch.dtype) -> None:
-        assert (self.input_size_per_partition %
-                self.quant_config.weight_bits == 0)
-        assert self.output_size % self.quant_config.pack_factor == 0
-        self.qweight = Parameter(
-            torch.empty(
-                self.input_size_per_partition,
-                self.output_size // self.quant_config.pack_factor,
-                device="cuda",
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        self.qzeros = Parameter(
-            torch.empty(
-                self.input_size_per_partition // self.quant_config.group_size,
-                self.output_size // self.quant_config.pack_factor,
-                device="cuda",
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        self.scales = Parameter(
-            torch.empty(
-                self.input_size_per_partition // self.quant_config.group_size,
-                self.output_size,
-                device="cuda",
-                dtype=dtype,
-            ),
-            requires_grad=False,
-        )
-
-    def apply_weights(self, x: torch.Tensor) -> torch.Tensor:
-        pack_factor = self.quant_config.pack_factor
-        out_shape = (x.shape[:-1] + (self.qweight.shape[-1] * pack_factor, ))
-        reshaped_x = x.reshape(-1, x.shape[-1])
-        out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales,
-                                        self.qzeros, pack_factor)
-        return out.reshape(out_shape)
--- a/server/vllm/vllm/model_executor/layers/rotary_embedding.py
+++ b/server/vllm/vllm/model_executor/layers/rotary_embedding.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Rotary Positional Embeddings."""
-from typing import Tuple, Union
-
-import torch
-import torch.nn as nn
-
-from vllm import pos_encoding_ops
-
-
-class RotaryEmbedding(nn.Module):
-    """Rotary positional embedding without any position scaling.
-
-    A table holding the cosine and sine of every (position, frequency) pair
-    for positions below `max_position_embeddings` is built once in the
-    constructor and stored as the non-persistent buffer `cos_sin_cache`.
-    """
-
-    def __init__(
-        self,
-        head_size: int,
-        rotary_dim: int,
-        max_position_embeddings: int,
-        base: int,
-        is_neox_style: bool,
-    ) -> None:
-        super().__init__()
-        self.head_size = head_size
-        self.rotary_dim = rotary_dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        self.is_neox_style = is_neox_style
-
-        # The table is computed in float32 and then cast to whatever default
-        # dtype is active while the module is being constructed.
-        table = self._compute_cos_sin_cache().to(torch.get_default_dtype())
-        self.register_buffer("cos_sin_cache", table, persistent=False)
-
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
-        """Return `1 / base**(d / rotary_dim)` for d = 0, 2, 4, ..."""
-        # NOTE(woosuk): HF builds the range with `torch.arange(...).float()`.
-        # Passing `dtype=torch.float` directly avoids numerical issues with
-        # large base values (e.g., 10000000), at the cost of a slight
-        # numerical difference from the HF implementation.
-        # NOTE(woosuk): An exact match with HF would also require computing
-        # on CPU and moving the result to GPU. The values are created on GPU
-        # for faster initialization, which may add another slight numerical
-        # difference.
-        even_dims = torch.arange(0,
-                                 self.rotary_dim,
-                                 2,
-                                 dtype=torch.float,
-                                 device="cuda")
-        exponents = even_dims / self.rotary_dim
-        return 1.0 / (base**exponents)
-
-    def _compute_cos_sin_cache(self) -> torch.Tensor:
-        """Build the table: cosines first, sines second, along the last dim."""
-        inv_freq = self._compute_inv_freq(self.base)
-        position_ids = torch.arange(self.max_position_embeddings,
-                                    dtype=torch.float,
-                                    device="cuda")
-
-        # Outer product: one row per position, one column per frequency.
-        angles = torch.einsum("i,j -> ij", position_ids, inv_freq)
-        return torch.cat((angles.cos(), angles.sin()), dim=-1)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Rotate `query` and `key` in place and return them."""
-        # The kernel writes its result into the tensors it is given, so the
-        # returned pair is the very same `query` / `key` objects.
-        pos_encoding_ops.rotary_embedding(positions, query, key,
-                                          self.head_size, self.cos_sin_cache,
-                                          self.is_neox_style)
-        return query, key
-
-
-class LinearScalingRotaryEmbedding(RotaryEmbedding):
-    """Rotary embedding whose positions are divided by a constant factor.
-
-    Credits to the Reddit user /u/kaiokendev
-    """
-
-    def __init__(
-        self,
-        head_size: int,
-        rotary_dim: int,
-        max_position_embeddings: int,
-        base: int,
-        is_neox_style: bool,
-        scaling_factor: float,
-    ) -> None:
-        # Must be set before the base constructor runs, because that
-        # constructor calls `_compute_cos_sin_cache`, which reads it.
-        self.scaling_factor = scaling_factor
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style)
-
-    def _compute_cos_sin_cache(self) -> torch.Tensor:
-        """Build the cos/sin table over the extended, rescaled positions."""
-        inv_freq = self._compute_inv_freq(self.base)
-        # NOTE(woosuk): `self.max_position_embeddings` is the original
-        # maximum length, before rope scaling. After scaling, the maximum
-        # length is `self.max_position_embeddings * self.scaling_factor`.
-        scaled_len = self.max_position_embeddings * self.scaling_factor
-        position_ids = torch.arange(scaled_len,
-                                    dtype=torch.float,
-                                    device="cuda")
-        position_ids = position_ids / self.scaling_factor
-
-        angles = torch.einsum("i,j -> ij", position_ids, inv_freq)
-        return torch.cat((angles.cos(), angles.sin()), dim=-1)
-
-
-class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
-    """Rotary embedding that enlarges the base (Dynamic NTK scaling).
-
-    Credits to the Reddit users /u/bloc97 and /u/emozilla
-    """
-
-    def __init__(
-        self,
-        head_size: int,
-        rotary_dim: int,
-        max_position_embeddings: int,
-        base: int,
-        is_neox_style: bool,
-        scaling_factor: float,
-    ) -> None:
-        # Must be set before the base constructor runs, because that
-        # constructor calls `_compute_cos_sin_cache`, which reads it.
-        self.scaling_factor = scaling_factor
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style)
-
-    def _compute_cos_sin_cache(self) -> torch.Tensor:
-        """Build the cos/sin table over the extended length with a larger base."""
-        # NOTE(woosuk): `self.max_position_embeddings` is the original
-        # maximum length, before rope scaling. After scaling, the maximum
-        # length is `self.max_position_embeddings * self.scaling_factor`.
-        scaled_len = self.max_position_embeddings * self.scaling_factor
-        length_ratio = (self.scaling_factor * scaled_len /
-                        self.max_position_embeddings)
-        growth = length_ratio - (self.scaling_factor - 1)
-        exponent = self.rotary_dim / (self.rotary_dim - 2)
-        scaled_base = self.base * growth**exponent
-
-        inv_freq = self._compute_inv_freq(scaled_base)
-        position_ids = torch.arange(scaled_len,
-                                    dtype=torch.float,
-                                    device="cuda")
-
-        angles = torch.einsum("i,j -> ij", position_ids, inv_freq)
-        return torch.cat((angles.cos(), angles.sin()), dim=-1)
--- a/server/vllm/vllm/model_executor/layers/sampler.py
+++ b/server/vllm/vllm/model_executor/layers/sampler.py
-"""A layer that samples the next tokens from the model's outputs."""
-from typing import Dict, List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_gather)
-from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput,
-                           SequenceData, SequenceGroupOutputs, SequenceOutputs)
-
-# Tolerance used below when deciding whether a temperature or a penalty is
-# effectively zero and whether top-p is effectively 1.
-_SAMPLING_EPS = 1e-5
-
-
-class Sampler(nn.Module):
-    """Turns the model's hidden states into sampled next tokens.
-
-    The forward pass runs these stages in order:
-    1. Keep only the hidden states that are needed for sampling (without
-        prompt logprobs, that is the final token of each prompt).
-    2. Project them onto the vocabulary to obtain logits.
-    3. Subtract presence and frequency penalties.
-    4. Divide by the temperature.
-    5. Mask out tokens excluded by top-p / top-k.
-    6. Sample the next tokens and collect the requested logprobs.
-    Sampling parameters are looked up per sequence group, so every group in
-    the batch may use different settings (sampling method, temperature,
-    top-p, top-k, etc.).
-    """
-
-    def __init__(self, vocab_size: int) -> None:
-        super().__init__()
-        self.vocab_size = vocab_size
-
-    def forward(
-        self,
-        embedding: torch.Tensor,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> SamplerOutput:
-        """Run the full sampling pipeline for one batch."""
-        # Stages 1 and 2: select the relevant rows and compute their logits.
-        sampled_states = _prune_hidden_states(hidden_states, input_metadata)
-        logits = _get_logits(sampled_states, embedding, embedding_bias,
-                             self.vocab_size)
-        # None of the later stages changes the number of rows.
-        num_rows = logits.shape[0]
-
-        # Stage 3: presence and frequency penalties.
-        generated_tokens = _get_output_tokens(input_metadata)
-        assert len(generated_tokens) == num_rows
-        presence, frequency = _get_penalties(input_metadata)
-        assert len(presence) == num_rows
-        assert len(frequency) == num_rows
-        logits = _apply_penalties(logits, generated_tokens, presence,
-                                  frequency)
-
-        # Stage 4: temperature scaling, skipped when every value is 1.
-        temperatures = _get_temperatures(input_metadata)
-        assert len(temperatures) == num_rows
-        if any(value != 1.0 for value in temperatures):
-            divisor = torch.tensor(temperatures,
-                                   dtype=logits.dtype,
-                                   device=logits.device)
-            # Divide in place so that no second logits tensor is allocated.
-            logits.div_(divisor.unsqueeze(dim=1))
-
-        # Stage 5: top-p / top-k truncation, skipped when nothing is cut.
-        top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
-        assert len(top_ps) == len(top_ks) == num_rows
-        truncate = (any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
-                    or any(k != self.vocab_size for k in top_ks))
-        if truncate:
-            logits = _apply_top_p_top_k(logits, top_ps, top_ks)
-
-        # Probabilities and log probabilities are kept in float32.
-        # `log_softmax` is used for the latter for numerical stability.
-        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
-        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
-
-        # Stage 6: draw the tokens, then look up the requested logprobs.
-        sample_results = _sample(probs, logprobs, input_metadata)
-        prompt_logprobs, sample_logprobs = _get_logprobs(
-            logprobs, input_metadata, sample_results)
-        return _build_sampler_output(sample_results, input_metadata,
-                                     prompt_logprobs, sample_logprobs)
-
-
-def _get_logits(hidden_states: torch.Tensor, embedding: torch.Tensor,
-                embedding_bias: Optional[torch.Tensor],
-                vocab_size: int) -> torch.Tensor:
-    """Project hidden states onto the vocabulary.
-
-    The product with the transposed embedding (plus the optional bias) is
-    all-gathered across tensor-parallel ranks and then cut down to the first
-    `vocab_size` columns, which drops any vocabulary padding.
-    """
-    local_logits = torch.matmul(hidden_states, embedding.t())
-    if embedding_bias is not None:
-        local_logits += embedding_bias
-    full_logits = tensor_model_parallel_all_gather(local_logits)
-    return full_logits[:, :vocab_size]
-
-
-def _prune_hidden_states(
-    hidden_states: torch.Tensor,
-    input_metadata: InputMetadata,
-) -> torch.Tensor:
-    """Select the rows of `hidden_states` that are needed for sampling.
-
-    The first `num_prompts` groups are prompts; each one advances the row
-    offset by `max_prompt_len`. Only the row of the last prompt token is
-    kept, unless prompt logprobs were requested, in which case all
-    `prompt_len` rows are kept. Every later group contributes one row per
-    sequence.
-    """
-    kept_rows: List[int] = []
-    offset = 0
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        if group_idx >= input_metadata.num_prompts:
-            # Generation phase: one row per running sequence.
-            next_offset = offset + len(seq_ids)
-            kept_rows.extend(range(offset, next_offset))
-            offset = next_offset
-            continue
-
-        assert len(seq_ids) == 1, "Prompt input should have only one seq."
-        last_row = offset + input_metadata.prompt_lens[group_idx] - 1
-        if sampling_params.prompt_logprobs is not None:
-            kept_rows.extend(range(offset, last_row))
-        kept_rows.append(last_row)
-        # The offset advances by `max_prompt_len`, not `prompt_len`
-        # (presumably because prompts are padded to a common length).
-        offset += input_metadata.max_prompt_len
-
-    row_index = torch.tensor(kept_rows,
-                             dtype=torch.long,
-                             device=hidden_states.device)
-    flat_states = hidden_states.view(-1, hidden_states.shape[-1])
-    return flat_states.index_select(0, row_index)
-
-
-def _get_penalties(
-        input_metadata: InputMetadata) -> Tuple[List[float], List[float]]:
-    """Return per-row presence and frequency penalties, in row order."""
-    presence: List[float] = []
-    frequency: List[float] = []
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        group_presence = sampling_params.presence_penalty
-        group_frequency = sampling_params.frequency_penalty
-        wants_prompt_logprobs = (group_idx < input_metadata.num_prompts and
-                                 sampling_params.prompt_logprobs is not None)
-        if wants_prompt_logprobs:
-            # NOTE: Prompt positions where no new token is sampled get a
-            # zero penalty.
-            num_prompt_rows = input_metadata.prompt_lens[group_idx] - 1
-            presence.extend([0] * num_prompt_rows)
-            frequency.extend([0] * num_prompt_rows)
-        group_size = len(seq_ids)
-        presence.extend([group_presence] * group_size)
-        frequency.extend([group_frequency] * group_size)
-    return presence, frequency
-
-
-def _get_output_tokens(input_metadata: InputMetadata) -> List[List[int]]:
-    """Return, for every row, the tokens generated so far for its sequence."""
-    rows: List[List[int]] = []
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        wants_prompt_logprobs = (group_idx < input_metadata.num_prompts and
-                                 sampling_params.prompt_logprobs is not None)
-        if wants_prompt_logprobs:
-            # NOTE: Prompt token positions need no output tokens for the
-            # penalty computation, so each gets its own empty list.
-            for _ in range(input_metadata.prompt_lens[group_idx] - 1):
-                rows.append([])
-        rows.extend(input_metadata.seq_data[seq_id].output_token_ids
-                    for seq_id in seq_ids)
-    return rows
-
-
-def _apply_penalties(
-    logits: torch.Tensor,
-    output_tokens: List[List[int]],
-    presence_penalties: List[float],
-    frequency_penalties: List[float],
-) -> torch.Tensor:
-    """Subtract presence and frequency penalties from `logits` in place.
-
-    `logits` is returned untouched when no row has both generated tokens
-    and a non-negligible penalty. Otherwise each token's logit is lowered
-    by `frequency * count` plus `presence` if it occurred at all, where
-    `count` is the number of times the token appears in that row's output.
-    """
-    num_seqs, vocab_size = logits.shape
-
-    def _row_is_penalized(row: int) -> bool:
-        if not output_tokens[row]:
-            return False
-        presence = presence_penalties[row]
-        frequency = frequency_penalties[row]
-        return not (abs(presence) < _SAMPLING_EPS
-                    and abs(frequency) < _SAMPLING_EPS)
-
-    if not any(_row_is_penalized(row) for row in range(num_seqs)):
-        # Nothing to do: every row has zero penalties or no output yet.
-        return logits
-
-    # Pad every row to the same length with the out-of-vocabulary id
-    # `vocab_size`, so that the rows can be stacked into one tensor.
-    longest = max(len(tokens) for tokens in output_tokens)
-    padded_rows = [
-        tokens + [vocab_size] * (longest - len(tokens))
-        for tokens in output_tokens
-    ]
-    token_ids = torch.tensor(padded_rows,
-                             dtype=torch.long,
-                             device=logits.device)
-
-    # Count token occurrences per row. The extra last column collects the
-    # padding entries and is dropped straight afterwards.
-    counts = torch.zeros((num_seqs, vocab_size + 1),
-                         dtype=torch.long,
-                         device=logits.device)
-    counts.scatter_add_(1, token_ids, torch.ones_like(token_ids))
-    counts = counts[:, :vocab_size]
-
-    frequency_column = torch.tensor(frequency_penalties,
-                                    dtype=logits.dtype,
-                                    device=logits.device).unsqueeze(dim=1)
-    presence_column = torch.tensor(presence_penalties,
-                                   dtype=logits.dtype,
-                                   device=logits.device).unsqueeze(dim=1)
-
-    # We follow the definition in OpenAI API.
-    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
-    logits -= frequency_column * counts
-    logits -= presence_column * (counts > 0)
-    return logits
-
-
-def _get_temperatures(input_metadata: InputMetadata) -> List[float]:
-    """Return the per-row temperature used to divide the logits."""
-    per_row: List[float] = []
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        value = sampling_params.temperature
-        if value < _SAMPLING_EPS:
-            # NOTE: Zero temperature means deterministic sampling
-            # (i.e., greedy sampling or beam search). Use 1 instead so
-            # that the later division cannot divide by zero.
-            value = 1.0
-        wants_prompt_logprobs = (group_idx < input_metadata.num_prompts and
-                                 sampling_params.prompt_logprobs is not None)
-        if wants_prompt_logprobs:
-            num_prompt_rows = input_metadata.prompt_lens[group_idx] - 1
-            per_row.extend([value] * num_prompt_rows)
-        per_row.extend([value] * len(seq_ids))
-    return per_row
-
-
-def _get_top_p_top_k(
-    input_metadata: InputMetadata,
-    vocab_size: int,
-) -> Tuple[List[float], List[int]]:
-    """Return per-row top-p and top-k values, in row order.
-
-    A top-k of -1 (no truncation) or above `vocab_size` is reported as
-    `vocab_size`.
-    """
-    per_row_p: List[float] = []
-    per_row_k: List[int] = []
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        group_p = sampling_params.top_p
-        # k can never exceed the vocabulary size.
-        group_k = min(sampling_params.top_k, vocab_size)
-        if group_k == -1:
-            # k=-1 means no truncation.
-            group_k = vocab_size
-        wants_prompt_logprobs = (group_idx < input_metadata.num_prompts and
-                                 sampling_params.prompt_logprobs is not None)
-        if wants_prompt_logprobs:
-            num_prompt_rows = input_metadata.prompt_lens[group_idx] - 1
-            per_row_p.extend([group_p] * num_prompt_rows)
-            per_row_k.extend([group_k] * num_prompt_rows)
-        group_size = len(seq_ids)
-        per_row_p.extend([group_p] * group_size)
-        per_row_k.extend([group_k] * group_size)
-    return per_row_p, per_row_k
-
-
-def _apply_top_p_top_k(
-    logits: torch.Tensor,
-    top_ps: List[float],
-    top_ks: List[int],
-) -> torch.Tensor:
-    """Return a copy of `logits` with the excluded tokens set to -inf.
-
-    Each row is sorted in descending order. An entry is masked when the
-    probability mass ahead of it already exceeds that row's top-p, or when
-    its rank is at or beyond that row's top-k. The result is returned in
-    the original column order.
-    """
-    p_limits = torch.tensor(top_ps, dtype=logits.dtype, device=logits.device)
-    k_limits = torch.tensor(top_ks, dtype=torch.int, device=logits.device)
-    sorted_logits, sorted_ids = logits.sort(dim=-1, descending=True)
-    neg_inf = -float("inf")
-
-    # Top-p: `cumsum - probs` is the mass strictly ahead of each entry, so
-    # the entry that crosses the threshold itself is still kept.
-    sorted_probs = sorted_logits.softmax(dim=-1)
-    mass_ahead = sorted_probs.cumsum(dim=-1) - sorted_probs
-    sorted_logits.masked_fill_(mass_ahead > p_limits.unsqueeze(dim=1),
-                               neg_inf)
-
-    # Top-k: mask every entry whose rank within its row is >= k.
-    num_rows, num_cols = sorted_ids.shape[0], sorted_ids.shape[-1]
-    ranks = torch.arange(num_cols, device=sorted_ids.device)
-    ranks = ranks.expand(num_rows, -1)
-    sorted_logits.masked_fill_(ranks >= k_limits.unsqueeze(dim=1), neg_inf)
-
-    # Undo the sort: the argsort of the permutation is its inverse.
-    inverse = torch.argsort(sorted_ids, dim=-1)
-    return torch.gather(sorted_logits, dim=-1, index=inverse)
-
-
-def _greedy_sample(
-    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
-    logprobs: torch.Tensor,
-) -> List[Tuple[List[int], List[int]]]:
-    """Pick the highest-logprob token for every (single-sequence) group.
-
-    Returns one `(next_token_ids, parent_ids)` pair per group.
-    """
-    best_tokens = torch.argmax(logprobs, dim=-1).cpu()
-    row = 0
-    picks = []
-    for seq_ids, _ in selected_seq_groups:
-        group_size = len(seq_ids)
-        assert group_size == 1, (
-            "Greedy sampling should have only one seq.")
-        token_id = best_tokens[row].item()
-        picks.append(([token_id], list(range(group_size))))
-        row += group_size
-    # Every row of `logprobs` must have been consumed by exactly one group.
-    assert row == logprobs.size(0)
-    return picks
-
-
-def _random_sample(
-    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
-    is_prompts: List[bool],
-    probs: torch.Tensor,
-) -> List[Tuple[List[int], List[int]]]:
-    """Draw next tokens from `probs` for every group.
-
-    A prompt group yields `best_of` tokens that all descend from its single
-    sequence; a generation group yields one token per sequence. Returns
-    one `(next_token_ids, parent_ids)` pair per group.
-    """
-    # One multinomial call serves all groups, so it has to draw as many
-    # columns as the largest `best_of` among the prompt-phase groups.
-    prompt_best_ofs = [
-        group[1].best_of
-        for group, is_prompt in zip(selected_seq_groups, is_prompts)
-        if is_prompt
-    ]
-    num_draws = max([1, *prompt_best_ofs])
-    draws = torch.multinomial(probs,
-                              num_samples=num_draws,
-                              replacement=True).cpu()
-
-    row = 0
-    picks = []
-    for (seq_ids, sampling_params), is_prompt in zip(selected_seq_groups,
-                                                     is_prompts):
-        group_size = len(seq_ids)
-        if not is_prompt:
-            # Generation phase: first draw of each sequence's row.
-            parent_ids = list(range(group_size))
-            next_token_ids = draws[row:row + group_size, 0].tolist()
-        else:
-            # Prompt phase: `best_of` draws from the single prompt row.
-            assert group_size == 1, (
-                "Prompt input should have only one seq.")
-            parent_ids = [0] * sampling_params.best_of
-            next_token_ids = draws[row, :sampling_params.best_of].tolist()
-        picks.append((next_token_ids, parent_ids))
-        row += group_size
-    assert row == probs.size(0)
-    return picks
-
-
-def _beam_search_sample(
-    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
-    is_prompts: List[bool],
-    seq_data: Dict[int, SequenceData],
-    logprobs: torch.Tensor,
-) -> List[Tuple[List[int], List[int]]]:
-    """Pick `2 * best_of` beam candidates for every group.
-
-    Returns one `(next_token_ids, parent_ids)` pair per group, where
-    `parent_ids[j]` is the position within the group of the sequence that
-    candidate `j` extends.
-    """
-    # We sample 2 * beam_width candidates to make sure that with high
-    # probability we can get `beam_width` candidates in addition to
-    # the finished sequences for the next iteration. See
-    # https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
-    # for details. See also HF reference:
-    # https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
-    #
-    # NOTE: Beam search is not vectorized, so its speed can be slower than
-    # other sampling methods.
-    row = 0
-    picks = []
-    for (seq_ids, sampling_params), is_prompt in zip(selected_seq_groups,
-                                                     is_prompts):
-        group_size = len(seq_ids)
-        num_candidates = 2 * sampling_params.best_of
-        group_logprobs = logprobs[row:row + group_size]
-        if is_prompt:
-            # Prompt phase: every candidate extends the single prompt.
-            assert group_size == 1, (
-                "Prompt input should have only one seq.")
-            parent_ids = [0] * num_candidates
-            next_token_ids = torch.topk(group_logprobs[0],
-                                        num_candidates).indices.tolist()
-        else:
-            # Generation phase: rank all (sequence, token) pairs by the
-            # sequence's cumulative logprob plus the token's logprob.
-            history = torch.tensor(
-                [seq_data[seq_id].cumulative_logprob for seq_id in seq_ids],
-                dtype=torch.float,
-                device=group_logprobs.device)
-            totals = group_logprobs + history.unsqueeze(dim=1)
-            flat_ids = torch.topk(totals.flatten(),
-                                  num_candidates).indices.tolist()
-            # A flat index encodes `parent * vocab_size + token`.
-            vocab_size = totals.size(-1)
-            parent_ids = []
-            next_token_ids = []
-            for flat_id in flat_ids:
-                parent_id, token_id = divmod(flat_id, vocab_size)
-                parent_ids.append(parent_id)
-                next_token_ids.append(token_id)
-        picks.append((next_token_ids, parent_ids))
-        row += group_size
-    assert row == logprobs.size(0)
-    return picks
-
-
-def _sample(
-    probs: torch.Tensor,
-    logprobs: torch.Tensor,
-    input_metadata: InputMetadata,
-) -> List[Tuple[List[int], List[int]]]:
-    """Sample next tokens for every sequence group.
-
-    Groups are bucketed by sampling type, each bucket is handed to the
-    matching sampler, and the results are put back in the original group
-    order. Returns one `(next_token_ids, parent_ids)` pair per group.
-    """
-    group_ids_by_type = {t: [] for t in SamplingType}
-    row_ids_by_type = {t: [] for t in SamplingType}
-    row = 0
-    for group_idx, (seq_ids, sampling_params) in enumerate(
-            input_metadata.seq_groups):
-        group_type = sampling_params.sampling_type
-        wants_prompt_logprobs = (group_idx < input_metadata.num_prompts and
-                                 sampling_params.prompt_logprobs is not None)
-        if wants_prompt_logprobs:
-            # NOTE: Prompt token positions are not sampled from; skip
-            # over their rows.
-            row += input_metadata.prompt_lens[group_idx] - 1
-        group_ids_by_type[group_type].append(group_idx)
-        next_row = row + len(seq_ids)
-        row_ids_by_type[group_type].extend(range(row, next_row))
-        row = next_row
-
-    results_by_group: Dict[int, Tuple[List[int], List[int]]] = {}
-    for sampling_type in SamplingType:
-        row_ids = row_ids_by_type[sampling_type]
-        if len(row_ids) == 0:
-            continue
-        group_ids = group_ids_by_type[sampling_type]
-        groups = [input_metadata.seq_groups[i] for i in group_ids]
-        is_prompts = [i < input_metadata.num_prompts for i in group_ids]
-        if sampling_type == SamplingType.GREEDY:
-            group_results = _greedy_sample(groups, logprobs[row_ids])
-        elif sampling_type == SamplingType.RANDOM:
-            group_results = _random_sample(groups, is_prompts,
-                                           probs[row_ids])
-        elif sampling_type == SamplingType.BEAM:
-            group_results = _beam_search_sample(groups, is_prompts,
-                                                input_metadata.seq_data,
-                                                logprobs[row_ids])
-        else:
-            raise ValueError(f"Unsupported sampling type: {sampling_type}")
-        results_by_group.update(zip(group_ids, group_results))
-
-    # Restore the original group order.
-    return [
-        results_by_group[i] for i in range(len(input_metadata.seq_groups))
-    ]
-
-
-def _get_logprobs(
-    logprobs: torch.Tensor,
-    input_metadata: InputMetadata,
-    sample_results: List[Tuple[List[int], List[int]]],
-) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[
-        int, float]]]]:
-    """Collect the logprobs to report for prompt tokens and sampled tokens.
-
-    Works in two passes over the sequence groups. The first pass gathers
-    the (row, token) pairs whose logprob is needed, so that they can be
-    fetched with one batched lookup and one batched top-k. The second pass
-    walks the groups in the same order and distributes those results.
-
-    Returns:
-        A pair `(prompt_logprobs, sample_logprobs)` with one entry per
-        sequence group. The prompt entry is None unless the group is a
-        prompt with `prompt_logprobs` set; otherwise it is a list that
-        starts with None (for the first prompt token) followed by one
-        `{token_id: logprob}` dict per remaining prompt token. The sample
-        entry holds one `{token_id: logprob}` dict per sampled token.
-    """
-    # Prepare query indices
-    batched_logprobs_query_seq_indices: List[int] = []
-    batched_logprobs_query_token_indices: List[int] = []
-    largest_num_logprobs = 0
-    # Row of `logprobs` at which the current group starts.
-    sample_idx = 0
-    for i, (seq_group, sample_result) in enumerate(
-            zip(input_metadata.seq_groups, sample_results)):
-        seq_ids, sampling_params = seq_group
-        next_token_ids, parent_ids = sample_result
-        num_parent_seqs = len(seq_ids)
-        if (i < input_metadata.num_prompts
-                and sampling_params.prompt_logprobs is not None):
-            largest_num_logprobs = max(largest_num_logprobs,
-                                       sampling_params.prompt_logprobs)
-            prompt_len = input_metadata.prompt_lens[i]
-            prompt_tokens = input_metadata.seq_data[
-                seq_ids[0]].prompt_token_ids
-            # Row j is paired with prompt token j + 1, i.e. each row is
-            # queried for the token that follows it in the prompt.
-            batched_logprobs_query_seq_indices.extend(
-                sample_idx + j for j in range(prompt_len - 1))
-            batched_logprobs_query_token_indices.extend(
-                token_id for token_id in prompt_tokens[1:])
-            sample_idx += prompt_len - 1
-        # Each sampled token is looked up in the row of its parent sequence.
-        batched_logprobs_query_seq_indices.extend(
-            [sample_idx + parent_id for parent_id in parent_ids])
-        batched_logprobs_query_token_indices.extend(next_token_ids)
-        if sampling_params.logprobs is not None:
-            largest_num_logprobs = max(largest_num_logprobs,
-                                       sampling_params.logprobs)
-        sample_idx += num_parent_seqs
-    assert sample_idx == logprobs.size(0)
-
-    # Batched query for logprobs of selected token
-    batched_logprobs_query_result = logprobs[[
-        batched_logprobs_query_seq_indices,
-        batched_logprobs_query_token_indices
-    ]].cpu()
-
-    # Batched query for logprobs of topk tokens
-    # One top-k of the largest requested size serves every group; each
-    # group slices off its own first `num_logprobs` columns below.
-    if largest_num_logprobs > 0:
-        top_logprobs, top_token_ids = torch.topk(logprobs,
-                                                 largest_num_logprobs,
-                                                 dim=-1)
-        top_logprobs = top_logprobs.cpu()
-        top_token_ids = top_token_ids.cpu()
-    else:
-        top_logprobs, top_token_ids = None, None
-
-    # Gather results
-    result_prompt_logprobs: List[Optional[PromptLogprobs]] = []
-    result_sample_logprobs: List[SampleLogprobs] = []
-    # `sample_idx` walks the rows of `logprobs` again, while
-    # `query_result_idx` walks the batched lookup results in the order in
-    # which the first pass queued them.
-    sample_idx = 0
-    query_result_idx = 0
-    for i, (seq_group, sample_result) in enumerate(
-            zip(input_metadata.seq_groups, sample_results)):
-        seq_ids, sampling_params = seq_group
-        next_token_ids, parent_ids = sample_result
-
-        # Prompt logprobs
-        if (i < input_metadata.num_prompts
-                and sampling_params.prompt_logprobs is not None):
-            num_logprobs = sampling_params.prompt_logprobs
-            prompt_len = input_metadata.prompt_lens[i]
-            prompt_tokens = input_metadata.seq_data[
-                seq_ids[0]].prompt_token_ids
-            # The first prompt token has no preceding row, so its entry is
-            # None.
-            group_prompt_logprobs: PromptLogprobs = [None]
-            for token_id in prompt_tokens[1:]:
-                prompt_logprobs_dict = {
-                    token_id:
-                    batched_logprobs_query_result[query_result_idx].item()
-                }
-                if num_logprobs > 0:
-                    prompt_logprobs_dict.update(
-                        zip(top_token_ids[sample_idx, :num_logprobs].tolist(),
-                            top_logprobs[sample_idx, :num_logprobs].tolist()))
-                group_prompt_logprobs.append(prompt_logprobs_dict)
-                sample_idx += 1
-                query_result_idx += 1
-            result_prompt_logprobs.append(group_prompt_logprobs)
-        else:
-            result_prompt_logprobs.append(None)
-
-        # Sample logprobs
-        num_logprobs = sampling_params.logprobs
-        if num_logprobs is None:
-            num_logprobs = 0
-        group_sample_logprobs: SampleLogprobs = []
-        for next_token_id, parent_id in zip(next_token_ids, parent_ids):
-            # The sampled token itself is always reported; the top-k
-            # entries are merged into the same dict when requested.
-            sample_logprobs_dict = {
-                next_token_id:
-                batched_logprobs_query_result[query_result_idx].item()
-            }
-            query_result_idx += 1
-            if num_logprobs > 0:
-                sample_logprobs_dict.update(
-                    zip(
-                        top_token_ids[sample_idx +
-                                      parent_id, :num_logprobs].tolist(),
-                        top_logprobs[sample_idx +
-                                     parent_id, :num_logprobs].tolist()))
-            group_sample_logprobs.append(sample_logprobs_dict)
-        result_sample_logprobs.append(group_sample_logprobs)
-        sample_idx += len(seq_ids)
-
-    return result_prompt_logprobs, result_sample_logprobs
-
-
-def _build_sampler_output(
-    sample_results: List[Tuple[List[int], List[int]]],
-    input_metadata: InputMetadata,
-    prompt_logprobs: List[Optional[PromptLogprobs]],
-    sample_logprobs: List[SampleLogprobs],
-) -> SamplerOutput:
-    """Wrap the per-group results into one `SequenceGroupOutputs` each.
-
-    Every sampled token becomes a `SequenceOutputs` carrying the id of the
-    sequence it extends, the token id and that token's logprob dict.
-    """
-    outputs = []
-    per_group = zip(input_metadata.seq_groups, sample_results,
-                    prompt_logprobs, sample_logprobs)
-    for ((seq_ids, _), (next_token_ids, parent_ids), group_prompt_logprobs,
-         group_sample_logprobs) in per_group:
-        # `parent_id` is a position within the group; map it back to the
-        # actual sequence id.
-        seq_outputs = [
-            SequenceOutputs(seq_ids[parent_id], token_id, token_logprobs)
-            for parent_id, token_id, token_logprobs in zip(
-                parent_ids, next_token_ids, group_sample_logprobs)
-        ]
-        outputs.append(
-            SequenceGroupOutputs(seq_outputs, group_prompt_logprobs))
-    return outputs
--- a/server/vllm/vllm/model_executor/model_loader.py
+++ b/server/vllm/vllm/model_executor/model_loader.py
-"""Utilities for selecting and loading models."""
-import contextlib
-from typing import Type
-
-import torch
-import torch.nn as nn
-from transformers import PretrainedConfig
-
-from vllm.config import ModelConfig
-from vllm.model_executor.models import *  # pylint: disable=wildcard-import
-from vllm.model_executor.weight_utils import (get_quant_config,
-                                              initialize_dummy_weights)
-
-# TODO(woosuk): Lazy-load the model classes.
-# Maps an architecture name (as listed in a HuggingFace config's
-# ``architectures`` field) to the model class that implements it.
-# Several names may map to the same class. The key order is what
-# `_get_model_architecture` prints in its "Supported architectures" error.
-_MODEL_REGISTRY = {
-    "AquilaModel": AquilaForCausalLM,
-    "AquilaForCausalLM": AquilaForCausalLM,  # AquilaChat2
-    "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
-    "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
-    "BloomForCausalLM": BloomForCausalLM,
-    "FalconForCausalLM": FalconForCausalLM,
-    "GPT2LMHeadModel": GPT2LMHeadModel,
-    "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
-    "GPTJForCausalLM": GPTJForCausalLM,
-    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
-    "InternLMForCausalLM": InternLMForCausalLM,
-    "LlamaForCausalLM": LlamaForCausalLM,
-    "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
-    # "MistralForCausalLM": MistralForCausalLM,
-    "MPTForCausalLM": MPTForCausalLM,
-    "OPTForCausalLM": OPTForCausalLM,
-    "QWenLMHeadModel": QWenLMHeadModel,
-    "RWForCausalLM": FalconForCausalLM,
-}
-
-# FIXME(woosuk): Remove this once all models support quantization.
-# Model classes whose constructor is called with a second ``quant_config``
-# argument by `get_model`; requesting quantization for any other class
-# raises a ValueError there.
-_MODEL_CLASSES_SUPPORT_QUANTIZATION = [
-    LlamaForCausalLM,
-    # MistralForCausalLM,
-]
-
-
-@contextlib.contextmanager
-def _set_default_torch_dtype(dtype: torch.dtype):
-    """Temporarily make ``dtype`` the default torch dtype.
-
-    The previous default is captured on entry and restored on exit. The
-    restore runs in a ``finally`` clause so that an exception raised inside
-    the ``with`` body (for example while constructing or loading a model)
-    does not leave the process-wide default dtype changed.
-
-    Args:
-        dtype: The dtype to install as the default for the ``with`` body.
-    """
-    old_dtype = torch.get_default_dtype()
-    torch.set_default_dtype(dtype)
-    try:
-        yield
-    finally:
-        torch.set_default_dtype(old_dtype)
-
-
-def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
-    """Return the model class for the first supported architecture.
-
-    Args:
-        config: Model config; its ``architectures`` attribute (if any) is
-            scanned in order.
-
-    Returns:
-        The `_MODEL_REGISTRY` entry for the first architecture name that is
-        registered.
-
-    Raises:
-        ValueError: If no listed architecture is registered, including when
-            the attribute is missing, empty, or ``None``.
-    """
-    # The attribute can exist but be None; fall back to an empty list so the
-    # caller gets the descriptive ValueError below rather than a TypeError
-    # from iterating None.
-    architectures = getattr(config, "architectures", None) or []
-    for arch in architectures:
-        if arch in _MODEL_REGISTRY:
-            return _MODEL_REGISTRY[arch]
-    raise ValueError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")
-
-
-def get_model(model_config: ModelConfig) -> nn.Module:
-    """Instantiate the model described by ``model_config`` on the GPU.
-
-    Steps:
-      1. Resolve the model class from ``model_config.hf_config``.
-      2. If quantization is requested, check that the class supports it,
-         that the current GPU's compute capability is high enough, and that
-         ``model_config.dtype`` is one of the supported activation dtypes.
-      3. With ``model_config.dtype`` as the default torch dtype, build the
-         model, then either fill it with dummy weights (``load_format ==
-         "dummy"``) or load the real weights, and move it to CUDA.
-
-    Returns:
-        The model in eval mode.
-
-    Raises:
-        ValueError: If any of the quantization checks fails.
-    """
-    model_class = _get_model_architecture(model_config.hf_config)
-    accepts_quant_config = model_class in _MODEL_CLASSES_SUPPORT_QUANTIZATION
-
-    # Get the quantization config.
-    quant_config = None
-    if model_config.quantization is not None:
-        if not accepts_quant_config:
-            raise ValueError(
-                f"Quantization is not supported for {model_class}.")
-        quant_config = get_quant_config(model_config.quantization,
-                                        model_config.model,
-                                        model_config.download_dir)
-
-        # Encode (major, minor) as a two-digit number, e.g. (8, 0) -> 80.
-        major, minor = torch.cuda.get_device_capability()
-        capability = major * 10 + minor
-        if capability < quant_config.get_min_capability():
-            raise ValueError(
-                f"The quantization method {model_config.quantization} is not "
-                "supported for the current GPU. "
-                f"Minimum capability: {quant_config.get_min_capability()}. "
-                f"Current capability: {capability}.")
-
-        supported_dtypes = quant_config.get_supported_act_dtypes()
-        if model_config.dtype not in supported_dtypes:
-            raise ValueError(
-                f"{model_config.dtype} is not supported for quantization "
-                f"method {model_config.quantization}. Supported dtypes: "
-                f"{supported_dtypes}")
-
-    with _set_default_torch_dtype(model_config.dtype):
-        # Create a model instance.
-        # The weights will be initialized as empty tensors.
-        if accepts_quant_config:
-            ctor_args = (model_config.hf_config, quant_config)
-        else:
-            ctor_args = (model_config.hf_config, )
-        model = model_class(*ctor_args)
-
-        if model_config.load_format != "dummy":
-            # Load the weights from the cached or downloaded files.
-            model.load_weights(model_config.model, model_config.download_dir,
-                               model_config.load_format, model_config.revision)
-            model = model.cuda()
-        else:
-            model = model.cuda()
-            # NOTE(woosuk): For accurate performance evaluation, we assign
-            # random values to the weights.
-            initialize_dummy_weights(model)
-    return model.eval()
--- a/server/vllm/vllm/model_executor/models/__init__.py
+++ b/server/vllm/vllm/model_executor/models/__init__.py
-from vllm.model_executor.models.aquila import AquilaForCausalLM
-from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
-                                                 BaichuanForCausalLM)
-from vllm.model_executor.models.bloom import BloomForCausalLM
-from vllm.model_executor.models.falcon import FalconForCausalLM
-from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
-from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
-from vllm.model_executor.models.gpt_j import GPTJForCausalLM
-from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
-from vllm.model_executor.models.internlm import InternLMForCausalLM
-from vllm.model_executor.models.llama import LlamaForCausalLM
-# from vllm.model_executor.models.mistral import MistralForCausalLM
-from vllm.model_executor.models.mpt import MPTForCausalLM
-from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.models.qwen import QWenLMHeadModel
-
-# Names exported by ``from vllm.model_executor.models import *``.
-# The Mistral entry is commented out to match its commented-out import above.
-__all__ = [
-    "AquilaForCausalLM",
-    "BaiChuanForCausalLM",
-    "BaichuanForCausalLM",
-    "BloomForCausalLM",
-    "FalconForCausalLM",
-    "GPT2LMHeadModel",
-    "GPTBigCodeForCausalLM",
-    "GPTJForCausalLM",
-    "GPTNeoXForCausalLM",
-    "InternLMForCausalLM",
-    "LlamaForCausalLM",
-    "MPTForCausalLM",
-    "OPTForCausalLM",
-    "QWenLMHeadModel",
-    # "MistralForCausalLM",
-]
--- a/server/vllm/vllm/model_executor/models/aquila.py
+++ b/server/vllm/vllm/model_executor/models/aquila.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only LLaMA model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (
-    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-from vllm.transformers_utils.configs.aquila import AquilaConfig
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class AquilaMLP(nn.Module):
-    """Feed-forward block of an Aquila decoder layer.
-
-    A single column-parallel projection produces ``2 * intermediate_size``
-    features, `SiluAndMul` is applied to that fused output, and a
-    row-parallel projection maps the result back to ``hidden_size``.
-    """
-
-    def __init__(self, hidden_size: int, intermediate_size: int,
-                 hidden_act: str):
-        """Create the projections.
-
-        Raises:
-            ValueError: If ``hidden_act`` is anything other than ``"silu"``.
-        """
-        super().__init__()
-        self.gate_up_proj = ColumnParallelLinear(hidden_size,
-                                                 2 * intermediate_size,
-                                                 bias=False,
-                                                 gather_output=False)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           input_is_parallel=True)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        """Apply projection, activation and down projection to ``x``."""
-        # Both parallel linear layers return a tuple; only the first item
-        # (the projected tensor) is used here.
-        fused, _ = self.gate_up_proj(x)
-        activated = self.act_fn(fused)
-        projected, _ = self.down_proj(activated)
-        return projected
-
-
-class AquilaRMSNorm(nn.Module):
-    """Root-mean-square normalization with a learned per-feature scale."""
-
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        AquilaRMSNorm is equivalent to T5LayerNorm
-
-        Args:
-            hidden_size: Size of the last dimension being normalized.
-            eps: Value added to the mean square before the reciprocal
-                square root.
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        """Normalize over the last dimension and cast back to the input dtype."""
-        orig_dtype = hidden_states.dtype
-        # The mean of squares is accumulated in float32 regardless of the
-        # input dtype.
-        as_fp32 = hidden_states.to(torch.float32)
-        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
-        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
-        normalized = hidden_states * inv_rms
-        scaled = self.weight * normalized
-        return scaled.to(orig_dtype)
-
-
-class AquilaAttention(nn.Module):
-    """Self-attention for Aquila with a fused QKV projection.
-
-    Query and key/value head counts are divided across the tensor-parallel
-    group; both must be divisible by the group size.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        max_position_embeddings: int = 8192,
-    ):
-        super().__init__()
-        tp_size = get_tensor_model_parallel_world_size()
-
-        # Total (unsharded) and per-rank head counts.
-        self.hidden_size = hidden_size
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.total_num_kv_heads = num_kv_heads
-        assert self.total_num_kv_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.num_kv_heads = self.total_num_kv_heads // tp_size
-
-        # Per-rank widths of the q and k/v slices of the fused projection.
-        head_dim = hidden_size // self.total_num_heads
-        self.head_dim = head_dim
-        self.q_size = self.num_heads * head_dim
-        self.kv_size = self.num_kv_heads * head_dim
-        self.scaling = head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        fused_qkv_width = (self.total_num_heads +
-                           2 * self.total_num_kv_heads) * head_dim
-        self.qkv_proj = ColumnParallelLinear(hidden_size,
-                                             fused_qkv_width,
-                                             bias=False,
-                                             gather_output=False)
-        self.o_proj = RowParallelLinear(self.total_num_heads * head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        input_is_parallel=True)
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            head_dim,
-            self.scaling,
-            rotary_dim=head_dim,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
-            num_kv_heads=self.num_kv_heads,
-        )
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to q/k/v, run paged attention, and project the output."""
-        fused_qkv, _ = self.qkv_proj(hidden_states)
-        split_sizes = [self.q_size, self.kv_size, self.kv_size]
-        q, k, v = fused_qkv.split(split_sizes, dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, key_cache, value_cache,
-                                input_metadata, cache_event)
-        projected, _ = self.o_proj(attn_output)
-        return projected
-
-
-class AquilaDecoderLayer(nn.Module):
-    """One Aquila decoder layer: pre-norm attention and pre-norm MLP, each
-    followed by a residual addition."""
-
-    def __init__(self, config: AquilaConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = AquilaAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            # Fall back to defaults when the config omits these fields.
-            rope_theta=getattr(config, "rope_theta", 10000),
-            max_position_embeddings=getattr(config,
-                                            "max_position_embeddings", 8192),
-        )
-        self.mlp = AquilaMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        norm_eps = config.rms_norm_eps
-        self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=norm_eps)
-        self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
-                                                      eps=norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Run the attention and MLP sub-blocks with residual connections."""
-        # Self Attention
-        attn_out = self.self_attn(
-            positions=positions,
-            hidden_states=self.input_layernorm(hidden_states),
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = hidden_states + attn_out
-
-        # Fully Connected
-        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
-        return hidden_states + mlp_out
-
-
-class AquilaModel(nn.Module):
-    """Aquila decoder stack: token embedding, decoder layers, final norm."""
-
-    def __init__(self, config: AquilaConfig):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
-        decoder_layers = [
-            AquilaDecoderLayer(config) for _ in range(config.num_hidden_layers)
-        ]
-        self.layers = nn.ModuleList(decoder_layers)
-        self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed ``input_ids`` and run every decoder layer in order.
-
-        Layer ``i`` receives ``kv_caches[i]`` and, when ``cache_events`` is
-        not None, ``cache_events[i]``.
-        """
-        hidden_states = self.embed_tokens(input_ids)
-        for idx, layer in enumerate(self.layers):
-            cache_event = None if cache_events is None else cache_events[idx]
-            hidden_states = layer(positions, hidden_states, kv_caches[idx],
-                                  input_metadata, cache_event)
-        return self.norm(hidden_states)
-
-
-class AquilaForCausalLM(nn.Module):
-    """Aquila decoder stack plus an LM head and sampler.
-
-    `forward` returns the sampler's output; `load_weights` copies checkpoint
-    tensors into this rank's (tensor-parallel) parameters.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.model = AquilaModel(config)
-        # Round the LM-head output size up to the next multiple of 64. The
-        # sampler is still constructed with the unpadded vocab size.
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.lm_head = ColumnParallelLinear(
-            config.hidden_size,
-            vocab_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        """Run the decoder stack and sample from its final hidden states."""
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   input_metadata, cache_events)
-        # The sampler is handed the LM-head weight matrix rather than
-        # precomputed logits.
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    # Weight-name suffixes passed to `load_tensor_parallel_weights` for the
-    # tensors that are not handled by the fused-projection paths below.
-    _column_parallel_weights = [
-        "qkv_proj.weight", "gate_proj.weight", "up_proj.weight"
-    ]
-    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        """Copy checkpoint tensors into this rank's parameters.
-
-        Separate ``q_proj``/``k_proj``/``v_proj`` tensors are written into
-        slices of the fused ``qkv_proj`` parameter, and ``gate_proj`` /
-        ``up_proj`` into the two halves of ``gate_up_proj``. Embedding and
-        LM-head tensors go through `load_padded_tensor_parallel_vocab`;
-        everything else goes through `load_tensor_parallel_weights`.
-        """
-        tp_size = get_tensor_model_parallel_world_size()
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        # Rows of q (resp. k/v) owned by one rank.
-        q_proj_shard_size = (self.config.hidden_size // tp_size)
-        kv_proj_shard_size = (self.config.hidden_size //
-                              self.config.num_attention_heads *
-                              self.config.num_key_value_heads // tp_size)
-        # Offsets lay the per-rank q, k and v shards out back to back inside
-        # the fused qkv_proj parameter.
-        attention_weight_specs = [
-            # (weight_name, shard_size, offset)
-            ("q_proj", q_proj_shard_size, 0),
-            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
-            ("v_proj", kv_proj_shard_size,
-             q_proj_shard_size + kv_proj_shard_size),
-        ]
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            # Checkpoint entries for rotary inverse frequencies are skipped.
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            is_attention_weight = False
-            for weight_name, shard_size, offset in attention_weight_specs:
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "qkv_proj")]
-
-                # Take this rank's rows of the checkpoint tensor ...
-                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank:shard_size *
-                    (tensor_model_parallel_rank + 1)]
-                # ... and copy them into the matching slice of qkv_proj.
-                param_slice = param.data[offset:offset + shard_size]
-                assert param_slice.shape == loaded_weight.shape
-
-                param_slice.copy_(loaded_weight)
-                is_attention_weight = True
-                break
-            if is_attention_weight:
-                continue
-
-            is_gate_up_weight = False
-            # stride_id selects the half of gate_up_proj: 0 = gate, 1 = up.
-            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "gate_up_proj")]
-                shard_size = param.shape[0] // 2
-                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank:shard_size *
-                    (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_gate_up_weight = True
-                break
-            if is_gate_up_weight:
-                continue
-
-            # Everything else keeps its checkpoint name.
-            param = state_dict[name]
-            if "embed_tokens" in name or "lm_head" in name:
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tensor_model_parallel_rank)
-                continue
-
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights,
-                                         tensor_model_parallel_rank)
--- a/server/vllm/vllm/model_executor/models/baichuan.py
+++ b/server/vllm/vllm/model_executor/models/baichuan.py
-# coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only BaiChuan model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-import math
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.attention import (PagedAttentionWithRoPE,
-                                                  PagedAttentionWithALiBi)
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (
-    convert_pyslice_to_tensor, hf_model_weights_iterator,
-    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    """Compute one ALiBi slope per attention head.
-
-    Let ``p`` be the largest power of two not exceeding ``total_num_heads``.
-    The first ``p`` slopes are ``base**1 .. base**p`` with
-    ``base = 2**(-(2**-(log2(p) - 3)))``. If ``total_num_heads`` is not a
-    power of two, the remaining slopes are odd powers (1, 3, 5, ...) of the
-    base computed for ``2 * p``.
-
-    Returns:
-        A 1-D float32 tensor of slopes.
-    """
-
-    def _slope_base(num_heads: int) -> torch.Tensor:
-        return torch.tensor(
-            2**(-(2**-(math.log2(num_heads) - 3))),
-            dtype=torch.float32,
-        )
-
-    pow2_heads = 2**math.floor(math.log2(total_num_heads))
-    exponents = torch.arange(1, 1 + pow2_heads, dtype=torch.int32)
-    primary = torch.pow(_slope_base(pow2_heads), exponents)
-    if pow2_heads == total_num_heads:
-        return primary
-
-    extra_count = min(pow2_heads, total_num_heads - pow2_heads)
-    odd_exponents = torch.arange(start=1,
-                                 end=1 + 2 * extra_count,
-                                 step=2,
-                                 dtype=torch.int32)
-    secondary = torch.pow(_slope_base(2 * pow2_heads), odd_exponents)
-    return torch.cat([primary, secondary], dim=0)
-
-
-class BaiChuanMLP(nn.Module):
-    """Feed-forward block of a BaiChuan decoder layer.
-
-    A column-parallel projection to ``2 * intermediate_size`` features is
-    followed by `SiluAndMul` and a row-parallel projection back to
-    ``hidden_size``.
-    """
-
-    def __init__(self, hidden_size: int, intermediate_size: int,
-                 hidden_act: str):
-        """Create the projections.
-
-        Raises:
-            ValueError: If ``hidden_act`` is anything other than ``"silu"``.
-        """
-        super().__init__()
-        self.gate_up_proj = ColumnParallelLinear(hidden_size,
-                                                 2 * intermediate_size,
-                                                 bias=False,
-                                                 gather_output=False)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           input_is_parallel=True)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        """Apply projection, activation and down projection to ``x``."""
-        # Only the first element of each parallel layer's result is used.
-        fused, _ = self.gate_up_proj(x)
-        activated = self.act_fn(fused)
-        projected, _ = self.down_proj(activated)
-        return projected
-
-
-class BaiChuanAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper
-
-    ``position_embedding == "ALIBI"`` selects `PagedAttentionWithALiBi`;
-    any other value selects `PagedAttentionWithRoPE`.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        position_embedding: str,
-        rope_theta: float = 10000,
-        max_position_embeddings: int = 8192,
-    ):
-        super().__init__()
-        tp_size = get_tensor_model_parallel_world_size()
-        self.hidden_size = hidden_size
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.head_dim = hidden_size // self.total_num_heads
-        # NOTE: the attribute name is misspelled ("postion"); it is kept
-        # as-is because it is part of the object's visible state.
-        self.postion_embedding = position_embedding
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        # pylint: disable=invalid-name
-        self.W_pack = ColumnParallelLinear(hidden_size,
-                                           3 * hidden_size,
-                                           bias=False,
-                                           gather_output=False)
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        input_is_parallel=True)
-
-        if self.postion_embedding != "ALIBI":
-            self.scaling = self.head_dim**-0.5
-            self.attn = PagedAttentionWithRoPE(
-                self.num_heads,
-                self.head_dim,
-                self.scaling,
-                rotary_dim=self.head_dim,
-                base=self.rope_theta,
-                max_position=self.max_position_embeddings)
-        else:
-            # Create the alibi slopes and keep only this rank's heads.
-            rank = get_tensor_model_parallel_rank()
-            first_head = rank * self.num_heads
-            last_head = (rank + 1) * self.num_heads
-            all_slopes = _get_alibi_slopes(self.total_num_heads)
-            rank_slopes = all_slopes[first_head:last_head].tolist()
-
-            scaling = self.head_dim**-0.5
-            self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                                scaling, rank_slopes)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to q/k/v, run paged attention, and project the output.
-
-        ``positions`` is forwarded to the attention op only on the non-ALiBi
-        path.
-        """
-        packed, _ = self.W_pack(hidden_states)
-        q, k, v = packed.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_args = (q, k, v, key_cache, value_cache, input_metadata,
-                     cache_event)
-        if self.postion_embedding != "ALIBI":
-            attn_args = (positions, ) + attn_args
-        attn_output = self.attn(*attn_args)
-
-        projected, _ = self.o_proj(attn_output)
-        return projected
-
-
-class BaiChuanDecoderLayer(nn.Module):
-    """One BaiChuan decoder layer: pre-norm attention and pre-norm MLP, each
-    followed by a residual addition."""
-
-    def __init__(self, config: BaiChuanConfig, position_embedding: str):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = BaiChuanAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            position_embedding=position_embedding,
-            # Fall back to defaults when the config omits these fields.
-            rope_theta=getattr(config, "rope_theta", 10000),
-            max_position_embeddings=getattr(config,
-                                            "max_position_embeddings", 8192),
-        )
-        self.mlp = BaiChuanMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        norm_eps = config.rms_norm_eps
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Run the attention and MLP sub-blocks with residual connections."""
-        # Self Attention
-        attn_out = self.self_attn(
-            positions=positions,
-            hidden_states=self.input_layernorm(hidden_states),
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = hidden_states + attn_out
-
-        # Fully Connected
-        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
-        return hidden_states + mlp_out
-
-
-class BaiChuanModel(nn.Module):
-    """BaiChuan decoder stack: token embedding, decoder layers, final norm."""
-
-    def __init__(self, config: BaiChuanConfig, position_embedding: str):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
-        decoder_layers = [
-            BaiChuanDecoderLayer(config, position_embedding)
-            for _ in range(config.num_hidden_layers)
-        ]
-        self.layers = nn.ModuleList(decoder_layers)
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed ``input_ids`` and run every decoder layer in order.
-
-        Layer ``i`` receives ``kv_caches[i]`` and, when ``cache_events`` is
-        not None, ``cache_events[i]``.
-        """
-        hidden_states = self.embed_tokens(input_ids)
-        for idx, layer in enumerate(self.layers):
-            cache_event = None if cache_events is None else cache_events[idx]
-            hidden_states = layer(positions, hidden_states, kv_caches[idx],
-                                  input_metadata, cache_event)
-        return self.norm(hidden_states)
-
-
-class BaiChuanBaseForCausalLM(nn.Module):
-
-    def __init__(self, config, position_embedding: str):
-        """Build the decoder stack, the LM head and the sampler.
-
-        Args:
-            config: Model config; ``hidden_size`` and ``vocab_size`` are
-                read here, the rest is passed on to `BaiChuanModel`.
-            position_embedding: Forwarded unchanged to `BaiChuanModel`.
-        """
-        super().__init__()
-        self.config = config
-        self.model = BaiChuanModel(config, position_embedding)
-        self.lm_head = ColumnParallelLinear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            gather_output=False)
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = []
-    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tp_world_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-
-            if "W_pack" in name:
-                total_num_heads = self.config.num_attention_heads
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // total_num_heads
-                num_heads = total_num_heads // tp_world_size
-                head_start = tp_rank * num_heads
-                head_end = (tp_rank + 1) * num_heads
-
-                loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                   head_size, hidden_size)
-                loaded_weight = loaded_weight[:, head_start:head_end, :, :]
-                loaded_weight = loaded_weight.reshape(-1, hidden_size)
-
-            is_gate_up_weight = False
-            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "gate_up_proj")]
-                shard_size = param.shape[0] // 2
-                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
-                                              (tp_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_gate_up_weight = True
-                break
-            if is_gate_up_weight:
-                continue
-
-            param = state_dict[name]
-
-            if "embed_tokens" in name or "lm_head" in name:
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tp_rank)
-                continue
-
-            load_tensor_parallel_weights(
-                param,
-                loaded_weight,
-                name,
-                self._column_parallel_weights,
-                self._row_parallel_weights,
-                tp_rank,
-            )
-
-
-class BaichuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 13b
-    """Baichuan variant that selects the "ALIBI" position embedding."""
-
-    def __init__(self, config):
-        # Only the position-embedding mode differs from the base class.
-        super().__init__(config, "ALIBI")
-
-
-class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 7b
-    """Baichuan variant that selects the "ROPE" position embedding."""
-
-    def __init__(self, config):
-        # Only the position-embedding mode differs from the base class.
-        super().__init__(config, "ROPE")
--- a/server/vllm/vllm/model_executor/models/bloom.py
+++ b/server/vllm/vllm/model_executor/models/bloom.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
-# Copyright 2023 The CacheFlow team.
-# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only BLOOM model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-import math
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import BloomConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-# Per-layer cache pair; the attention layer unpacks it as (k_cache, v_cache).
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    """Return the ALiBi slope for each of ``total_num_heads`` heads.
-
-    For ``p`` = largest power of two not exceeding the head count, the first
-    ``p`` slopes are ``base**1 .. base**p`` with ``base = 2**(-8 / p)``. Any
-    remaining heads take the odd powers of the base computed for ``2 * p``.
-
-    Returns:
-        A 1-D tensor with one slope per head.
-    """
-
-    def _slope_base(num_heads_pow2: int) -> torch.Tensor:
-        # 2 ** (-(2 ** -(log2(n) - 3))) == 2 ** (-8 / n)
-        exponent = -(2**-(math.log2(num_heads_pow2) - 3))
-        return torch.tensor(2**exponent, dtype=torch.float32)
-
-    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
-    main_powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
-    main_slopes = torch.pow(_slope_base(closest_power_of_2), main_powers)
-
-    if closest_power_of_2 == total_num_heads:
-        return main_slopes
-
-    # Heads beyond the power of two interleave between the main slopes.
-    num_extra_heads = min(closest_power_of_2,
-                          total_num_heads - closest_power_of_2)
-    odd_powers = torch.arange(start=1,
-                              end=1 + 2 * num_extra_heads,
-                              step=2,
-                              dtype=torch.int32)
-    extra_slopes = torch.pow(_slope_base(2 * closest_power_of_2), odd_powers)
-    return torch.cat([main_slopes, extra_slopes], dim=0)
-
-
-class BloomAttention(nn.Module):
-    """BLOOM self-attention: fused QKV, ALiBi paged attention, output dense.
-
-    Heads are divided evenly across tensor-parallel ranks; each rank keeps
-    only the ALiBi slopes for the heads it owns.
-    """
-
-    def __init__(self, config: BloomConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.total_num_heads = config.n_head
-        self.head_dim = self.hidden_size // self.total_num_heads
-        assert self.head_dim * self.total_num_heads == self.hidden_size
-
-        world_size = get_tensor_model_parallel_world_size()
-        assert self.total_num_heads % world_size == 0
-        self.num_heads = self.total_num_heads // world_size
-
-        self.query_key_value = ColumnParallelLinear(self.hidden_size,
-                                                    3 * self.hidden_size,
-                                                    bias=True,
-                                                    gather_output=False)
-        self.dense = RowParallelLinear(self.hidden_size,
-                                       self.hidden_size,
-                                       bias=True,
-                                       input_is_parallel=True)
-
-        # Compute slopes for all heads, then keep this rank's slice.
-        rank = get_tensor_model_parallel_rank()
-        first_head = rank * self.num_heads
-        last_head = (rank + 1) * self.num_heads
-        all_slopes = _get_alibi_slopes(self.total_num_heads)
-        rank_slopes = all_slopes[first_head:last_head].tolist()
-
-        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                            self.head_dim**-0.5, rank_slopes)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to Q/K/V, attend through the paged cache, project back.
-
-        ``position_ids`` is accepted for interface parity but is not used.
-        """
-        del position_ids  # Unused.
-        fused_qkv, _ = self.query_key_value(hidden_states)
-        query, key, value = fused_qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        context = self.attn(query, key, value, key_cache, value_cache,
-                            input_metadata, cache_event)
-        projected, _ = self.dense(context)
-        return projected
-
-
-class BloomMLP(nn.Module):
-    """BLOOM feed-forward block: expand to 4x hidden, GELU, project back."""
-
-    def __init__(self, config: BloomConfig):
-        super().__init__()
-        width = config.hidden_size
-        self.dense_h_to_4h = ColumnParallelLinear(width,
-                                                  4 * width,
-                                                  gather_output=False)
-        self.act = get_act_fn("gelu")
-        self.dense_4h_to_h = RowParallelLinear(4 * width,
-                                               width,
-                                               input_is_parallel=True)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Apply up-projection, activation and down-projection to ``x``."""
-        expanded, _ = self.dense_h_to_4h(x)
-        activated = self.act(expanded)
-        contracted, _ = self.dense_4h_to_h(activated)
-        return contracted
-
-
-class BloomBlock(nn.Module):
-    """One BLOOM transformer layer: LayerNorm, attention, LayerNorm, MLP.
-
-    ``config.apply_residual_connection_post_layernorm`` selects whether each
-    residual branch starts from the normalized tensor or the raw one.
-    """
-
-    def __init__(self, config: BloomConfig):
-        super().__init__()
-        width = config.hidden_size
-        ln_eps = config.layer_norm_epsilon
-
-        self.input_layernorm = nn.LayerNorm(width, eps=ln_eps)
-        self.self_attention = BloomAttention(config)
-        self.post_attention_layernorm = nn.LayerNorm(width, eps=ln_eps)
-        self.mlp = BloomMLP(config)
-        self.apply_residual_connection_post_layernorm = (
-            config.apply_residual_connection_post_layernorm)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Run attention and MLP, each followed by a residual add."""
-        residual_after_ln = self.apply_residual_connection_post_layernorm
-
-        # Attention sub-layer.
-        normed_input = self.input_layernorm(hidden_states)
-        attn_residual = normed_input if residual_after_ln else hidden_states
-        attn_out = self.self_attention(
-            position_ids=position_ids,
-            hidden_states=normed_input,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        attn_out = attn_out + attn_residual
-
-        # MLP sub-layer.
-        normed_attn = self.post_attention_layernorm(attn_out)
-        mlp_residual = normed_attn if residual_after_ln else attn_out
-        return self.mlp(normed_attn) + mlp_residual
-
-
-class BloomModel(nn.Module):
-    """BLOOM backbone: embedding, embedding LayerNorm, blocks, final norm."""
-
-    def __init__(self, config: BloomConfig):
-        super().__init__()
-        self.embed_dim = config.hidden_size
-        ln_eps = config.layer_norm_epsilon
-
-        # Token embedding followed by its own LayerNorm.
-        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                      self.embed_dim)
-        self.word_embeddings_layernorm = nn.LayerNorm(self.embed_dim,
-                                                      eps=ln_eps)
-
-        # Stack of transformer blocks.
-        blocks = []
-        for _ in range(config.num_hidden_layers):
-            blocks.append(BloomBlock(config))
-        self.h = nn.ModuleList(blocks)
-
-        # Normalization applied to the output of the last block.
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=ln_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed the tokens and pass them through every block in order.
-
-        ``kv_caches`` and ``cache_events`` (when not ``None``) are indexed by
-        block position.
-        """
-        hidden_states = self.word_embeddings_layernorm(
-            self.word_embeddings(input_ids))
-        for block_idx, block in enumerate(self.h):
-            block_event = (None if cache_events is None else
-                           cache_events[block_idx])
-            hidden_states = block(
-                position_ids,
-                hidden_states,
-                kv_caches[block_idx],
-                input_metadata,
-                block_event,
-            )
-        return self.ln_f(hidden_states)
-
-
-class BloomForCausalLM(nn.Module):
-    """BLOOM causal LM; the output projection reuses the embedding weight."""
-
-    def __init__(self, config: BloomConfig):
-        super().__init__()
-        self.config = config
-        self.transformer = BloomModel(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.transformer.word_embeddings.weight
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        """Run the backbone and sample using the tied LM head weight."""
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    # Name patterns handed to ``load_tensor_parallel_weights``. These are
-    # class-level and therefore shared by every instance: treat as read-only.
-    _column_parallel_weights = [
-        "word_embeddings.weight", "dense_h_to_4h.weight", "dense_h_to_4h.bias"
-    ]
-    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        """Copy this rank's share of a checkpoint into the model parameters.
-
-        Checkpoint names lacking the ``transformer.`` prefix get it added
-        before lookup. A checkpoint ``lm_head.weight`` is written into the
-        tied ``lm_head_weight``. Fused QKV tensors are sliced for this rank
-        and re-laid-out before being handed to
-        ``load_tensor_parallel_weights``.
-
-        Raises:
-            ValueError: If a ``query_key_value`` tensor is neither a
-                ``.weight`` nor a ``.bias``.
-        """
-        tp_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-        # Work on a per-call copy. Appending to the class attribute would
-        # leak into every other instance and grow on each call.
-        column_parallel_weights = list(self._column_parallel_weights)
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if name == "lm_head.weight":
-                # Since hidden_states are parallelized, we need to
-                # load lm_head.weight in parallel.
-                if name not in column_parallel_weights:
-                    column_parallel_weights.append(name)
-                # If lm_head is provided, use it instead.
-                param = self.lm_head_weight
-            else:
-                if not name.startswith("transformer."):
-                    name = "transformer." + name
-                param = state_dict[name]
-
-            if "query_key_value" in name:
-                # NOTE(woosuk): BLOOM's fused QKV has the shape of
-                # [num_heads * 3 * head_size, hidden_size], while the
-                # required shape is [3 * num_heads * head_size, hidden_size].
-                # Thus, we need weight conversion.
-                shard_size = param.shape[0]
-                start = shard_size * tp_rank
-                end = shard_size * (tp_rank + 1)
-                loaded_weight = loaded_weight[start:end]
-
-                num_heads = self.config.num_attention_heads
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // num_heads
-                if "query_key_value.weight" in name:
-                    loaded_weight = loaded_weight.view(-1, 3, head_size,
-                                                       hidden_size)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
-                elif "query_key_value.bias" in name:
-                    loaded_weight = loaded_weight.view(-1, 3, head_size)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1)
-                else:
-                    raise ValueError(f"Unexpected weight name: {name}")
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)
--- a/server/vllm/vllm/model_executor/models/falcon.py
+++ b/server/vllm/vllm/model_executor/models/falcon.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
-# Copyright 2023 The vLLM team.
-# Copyright 2023 the Falcon authors and HuggingFace Inc. team.  All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch Falcon model."""
-
-import math
-from typing import List, Optional, Tuple, Union
-
-import torch
-from torch import nn
-from torch.nn import LayerNorm
-from transformers import FalconConfig as HF_FalconConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.attention import (PagedAttention,
-                                                  PagedAttentionWithALiBi,
-                                                  PagedAttentionWithRoPE)
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
-                                              hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce)
-from vllm.sequence import SamplerOutput
-from vllm.transformers_utils.configs import RWConfig
-
-# Per-layer cache pair; the attention layer unpacks it as (k_cache, v_cache).
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-# Either the Hugging Face Falcon config or vLLM's own RWConfig.
-FalconConfig = Union[HF_FalconConfig, RWConfig]
-
-
-# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during
-# training, this means that there's one additional quantization to bfloat16
-# between the operations. In order not to degrade the quality of our HF-port,
-# we keep these characteristics in the final model.
-class FalconLinear(nn.Linear):
-    """``nn.Linear`` whose matmul and bias add run as two separate ops."""
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Return ``x @ W.T``, plus the bias when the layer has one."""
-        projected = x @ self.weight.T
-        if self.bias is not None:
-            return projected + self.bias
-        return projected
-
-
-def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    """Compute one ALiBi slope per attention head.
-
-    The first ``p`` heads (``p`` = largest power of two not exceeding
-    ``total_num_heads``) get consecutive powers of ``2**(-8 / p)``. Any
-    remaining heads get the odd powers of ``2**(-8 / (2 * p))``.
-
-    Returns:
-        A 1-D tensor of length ``total_num_heads``.
-    """
-    pow2_heads = 2**math.floor(math.log2(total_num_heads))
-
-    primary_exponent = -(2**-(math.log2(pow2_heads) - 3))
-    primary_base = torch.tensor(2**primary_exponent, dtype=torch.float32)
-    primary_slopes = torch.pow(
-        primary_base, torch.arange(1, 1 + pow2_heads, dtype=torch.int32))
-
-    if pow2_heads != total_num_heads:
-        # Fill the leftover heads from the next power-of-two schedule,
-        # taking every other entry.
-        secondary_exponent = -(2**-(math.log2(2 * pow2_heads) - 3))
-        secondary_base = torch.tensor(2**secondary_exponent,
-                                      dtype=torch.float32)
-        leftover = min(pow2_heads, total_num_heads - pow2_heads)
-        secondary_slopes = torch.pow(
-            secondary_base,
-            torch.arange(1, 1 + 2 * leftover, 2, dtype=torch.int32))
-        return torch.cat([primary_slopes, secondary_slopes], dim=0)
-
-    return primary_slopes
-
-
-class FalconAttention(nn.Module):
-    """Falcon self-attention with tensor-parallel projections.
-
-    The projection layout is chosen from the config:
-      * ``new_decoder_architecture``: fused QKV with ``config.num_kv_heads``
-        key/value heads, split across ranks.
-      * ``multi_query``: a column-parallel ``query`` plus a plain
-        (non-parallel) ``FalconLinear`` producing one key/value head.
-      * otherwise: fused QKV with as many key/value heads as query heads.
-
-    Positional handling is rotary, ALiBi, or none, per ``config.rotary`` and
-    ``config.alibi`` (which may not both be set).
-    """
-
-    def __init__(self, config: FalconConfig):
-        super().__init__()
-
-        self.hidden_size = config.hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-
-        self.total_num_heads = config.num_attention_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.head_dim = self.hidden_size // self.total_num_heads
-        assert self.head_dim * self.total_num_heads == self.hidden_size
-
-        self.new_decoder_architecture = config.new_decoder_architecture
-        self.multi_query = config.multi_query
-
-        if self.new_decoder_architecture:
-            self.total_num_kv_heads = config.num_kv_heads
-            # Validate the KV heads (not the query heads again): the floor
-            # division below would otherwise silently drop heads.
-            assert self.total_num_kv_heads % tp_size == 0, (
-                "num_kv_heads must be divisible by the tensor parallel size.")
-            self.num_kv_heads = self.total_num_kv_heads // tp_size
-            self.query_key_value = ColumnParallelLinear(
-                self.hidden_size,
-                (self.total_num_heads + 2 * self.total_num_kv_heads) *
-                self.head_dim,
-                bias=config.bias,
-                gather_output=False,
-                skip_bias_add=True,
-            )
-        elif self.multi_query:
-            self.total_num_kv_heads = 1
-            self.num_kv_heads = 1
-            self.query = ColumnParallelLinear(
-                self.hidden_size,
-                self.total_num_heads * self.head_dim,
-                bias=config.bias,
-                gather_output=False,
-                skip_bias_add=True,
-            )
-            self.key_value = FalconLinear(self.hidden_size,
-                                          2 * self.head_dim,
-                                          bias=config.bias)
-        else:
-            self.total_num_kv_heads = self.total_num_heads
-            self.num_kv_heads = self.num_heads
-            self.query_key_value = ColumnParallelLinear(
-                self.hidden_size,
-                (self.total_num_heads + 2 * self.total_num_kv_heads) *
-                self.head_dim,
-                bias=config.bias,
-                gather_output=False,
-                skip_bias_add=True,
-            )
-
-        # Per-rank widths used to split the projection output in forward().
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-
-        # Layer-wise attention scaling
-        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
-        # When attention and MLP run in parallel, the decoder layer performs
-        # a single all-reduce itself, so the row-parallel layer must not.
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
-        self.dense = RowParallelLinear(
-            self.hidden_size,
-            self.hidden_size,
-            bias=config.bias,
-            input_is_parallel=True,
-            skip_bias_add=True,
-            reduce_results=self.reduce_row_parallel_results)
-
-        self.use_rotary = config.rotary
-        self.use_alibi = config.alibi
-        assert not (self.use_rotary and self.use_alibi), (
-            "Rotary and alibi are mutually exclusive.")
-
-        if self.use_rotary:
-            rope_theta = getattr(config, "rope_theta", 10000)
-            max_position_embeddings = getattr(config,
-                                              "max_position_embeddings", 8192)
-            self.attn = PagedAttentionWithRoPE(
-                self.num_heads,
-                self.head_dim,
-                self.inv_norm_factor,
-                base=rope_theta,
-                max_position=max_position_embeddings,
-                rotary_dim=self.head_dim,
-                num_kv_heads=self.num_kv_heads)
-        elif self.use_alibi:
-            # Slopes are computed for all heads, pre-multiplied by the
-            # attention scale, then sliced to this rank's heads.
-            tp_rank = get_tensor_model_parallel_rank()
-            head_start = tp_rank * self.num_heads
-            head_end = (tp_rank + 1) * self.num_heads
-            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
-                            self.inv_norm_factor)
-            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
-            self.attn = PagedAttentionWithALiBi(self.num_heads,
-                                                self.head_dim,
-                                                self.inv_norm_factor,
-                                                alibi_slopes,
-                                                num_kv_heads=self.num_kv_heads)
-        else:
-            self.attn = PagedAttention(self.num_heads,
-                                       self.head_dim,
-                                       scale=self.inv_norm_factor,
-                                       num_kv_heads=self.num_kv_heads)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """Compute attention and return ``(output, bias)`` from ``dense``.
-
-        The output-projection bias is returned separately rather than added
-        here; the caller adds it when it is not ``None``. ``positions`` is
-        only passed to the attention op when rotary embeddings are enabled.
-        """
-        if not self.new_decoder_architecture and self.multi_query:
-            q, bias = self.query(hidden_states)
-            if bias is not None:
-                q += bias
-            kv = self.key_value(hidden_states)
-            k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
-        else:
-            qkv, bias = self.query_key_value(hidden_states)
-            if bias is not None:
-                qkv += bias
-            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
-                                dim=-1)
-        k_cache, v_cache = kv_cache
-        if self.use_rotary:
-            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                    input_metadata, cache_event)
-        else:
-            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
-                                    cache_event)
-        attn_output, bias = self.dense(attn_output)
-        return attn_output, bias
-
-
-class FalconMLP(nn.Module):
-    """Falcon feed-forward block: expand to 4x hidden, GELU, project back.
-
-    Both projections are built with ``skip_bias_add=True``, so their biases
-    are returned alongside the output instead of being fused into it.
-    """
-
-    def __init__(self, config: FalconConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
-                                                  4 * hidden_size,
-                                                  bias=config.bias,
-                                                  gather_output=False,
-                                                  skip_bias_add=True)
-        self.act = nn.GELU()
-        # When attention and MLP run in parallel, the decoder layer performs
-        # a single all-reduce itself, so the row-parallel layer must not.
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
-        self.dense_4h_to_h = RowParallelLinear(
-            4 * hidden_size,
-            hidden_size,
-            bias=config.bias,
-            input_is_parallel=True,
-            skip_bias_add=True,
-            reduce_results=self.reduce_row_parallel_results)
-
-    def forward(
-            self,
-            x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """Return ``(output, bias)`` of the down-projection.
-
-        The up-projection bias is added here before the activation; the
-        down-projection bias is left for the caller to add when it is not
-        ``None``.
-        """
-        # NOTE(zhuohan): Following huggingface, we do not fuse bias add here.
-        x, bias = self.dense_h_to_4h(x)
-        if bias is not None:
-            x += bias
-        x = self.act(x)
-        x, bias = self.dense_4h_to_h(x)
-        return x, bias
-
-
-class FalconDecoderLayer(nn.Module):
-    """One Falcon transformer layer (attention + MLP + residual).
-
-    Three layouts are handled, selected by the config:
-      * ``new_decoder_architecture``: separate LayerNorms (``ln_attn``,
-        ``ln_mlp``) feed attention and MLP from the same input.
-      * ``parallel_attn``: one ``input_layernorm`` feeds both branches.
-      * otherwise: sequential attention then MLP, with a
-        ``post_attention_layernorm`` in between.
-    """
-
-    def __init__(self, config: FalconConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.self_attention = FalconAttention(config)
-        self.mlp = FalconMLP(config)
-        self.config = config
-
-        if config.new_decoder_architecture:
-            # The layer norm before self-attention
-            self.ln_attn = LayerNorm(hidden_size,
-                                     eps=config.layer_norm_epsilon)
-            # The layer norm before the MLP
-            self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        else:
-            self.input_layernorm = LayerNorm(hidden_size,
-                                             eps=config.layer_norm_epsilon)
-            if not config.parallel_attn:
-                self.post_attention_layernorm = LayerNorm(
-                    hidden_size, eps=config.layer_norm_epsilon)
-
-        # True only for the sequential layout; in the parallel layouts the
-        # all-reduce is done once in forward() instead.
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Apply attention and MLP to ``hidden_states`` and add the residual.
-
-        NOTE(review): in the sequential layout ``residual`` aliases
-        ``hidden_states`` and is updated with ``+=``, so the caller's tensor
-        appears to be modified in place -- confirm callers do not reuse it.
-        """
-        residual = hidden_states
-
-        if self.config.new_decoder_architecture:
-            attention_layernorm_out = self.ln_attn(hidden_states)
-            mlp_layernorm_out = self.ln_mlp(hidden_states)
-        else:
-            attention_layernorm_out = self.input_layernorm(hidden_states)
-
-        # Self attention.
-        attention_output, attention_bias = self.self_attention(
-            positions=positions,
-            hidden_states=attention_layernorm_out,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        # Sequential layout: the bias can be added right away. In the
-        # parallel layouts it is deferred until after the all-reduce below.
-        if self.reduce_row_parallel_results and attention_bias is not None:
-            attention_output += attention_bias
-
-        if not self.config.new_decoder_architecture:
-            if self.config.parallel_attn:
-                # MLP consumes the same normalized input as attention.
-                mlp_layernorm_out = attention_layernorm_out
-            else:
-                # Sequential layout: fold attention into the residual, then
-                # normalize that sum for the MLP.
-                residual += attention_output
-                mlp_layernorm_out = self.post_attention_layernorm(residual)
-
-        # MLP.
-        mlp_output, mlp_bias = self.mlp(mlp_layernorm_out)
-        if self.reduce_row_parallel_results and mlp_bias is not None:
-            mlp_output += mlp_bias
-
-        if not self.reduce_row_parallel_results:
-            # When MLP and Attention layers are parallel, we can use
-            # only one all-reduce operator to reduce the results from
-            # both MLP and Attention layers.
-            mlp_output += attention_output
-            mlp_output = tensor_model_parallel_all_reduce(mlp_output)
-            # Biases are added once, after the reduction.
-            if attention_bias is not None:
-                mlp_output += attention_bias
-            if mlp_bias is not None:
-                mlp_output += mlp_bias
-
-        output = mlp_output + residual
-
-        return output
-
-
-class FalconModel(nn.Module):
-    """Falcon backbone: token embedding, decoder layers, final LayerNorm."""
-
-    def __init__(self, config: FalconConfig):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.use_alibi = config.alibi
-
-        # Token embedding.
-        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                      self.embed_dim)
-
-        # Stack of decoder layers.
-        decoder_layers = []
-        for _ in range(config.num_hidden_layers):
-            decoder_layers.append(FalconDecoderLayer(config))
-        self.h = nn.ModuleList(decoder_layers)
-
-        # Normalization applied to the output of the last layer.
-        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed the tokens and pass them through every layer in order.
-
-        ``kv_caches`` and ``cache_events`` (when not ``None``) are indexed by
-        layer position.
-        """
-        hidden_states = self.word_embeddings(input_ids)
-        for layer_idx, decoder_layer in enumerate(self.h):
-            layer_event = (None if cache_events is None else
-                           cache_events[layer_idx])
-            hidden_states = decoder_layer(
-                positions,
-                hidden_states,
-                kv_caches[layer_idx],
-                input_metadata,
-                layer_event,
-            )
-        return self.ln_f(hidden_states)
-
-
-class FalconForCausalLM(nn.Module):
-
-    def __init__(self, config: FalconConfig):
-        super().__init__()
-        self.config = config
-        self.transformer = FalconModel(config)
-        self.lm_head = ColumnParallelLinear(
-            config.hidden_size,
-            config.vocab_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.transformer(
-            input_ids,
-            positions,
-            kv_caches,
-            input_metadata,
-            cache_events,
-        )
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
-
-        return next_tokens
-
-    _column_parallel_weights = [
-        "word_embeddings.weight", "lm_head.weight", "dense_h_to_4h.weight",
-        "dense_h_to_4h.bias"
-    ]
-    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tp_size = (get_tensor_model_parallel_world_size())
-        tp_rank = get_tensor_model_parallel_rank()
-
-        hidden_size = self.config.hidden_size
-        total_num_heads = self.config.num_attention_heads
-        num_heads = total_num_heads // tp_size
-        head_size = hidden_size // total_num_heads
-        head_start = tp_rank * num_heads
-        head_end = (tp_rank + 1) * num_heads
-        if self.config.new_decoder_architecture:
-            total_num_kv_heads = self.config.num_kv_heads
-            num_kv_heads = total_num_kv_heads // tp_size
-            separated_q_kv = False
-            kv_head_start = tp_rank * num_kv_heads
-            kv_head_end = (tp_rank + 1) * num_kv_heads
-        elif self.config.multi_query:
-            total_num_kv_heads = 1
-            num_kv_heads = 1
-            separated_q_kv = True
-            kv_head_start = 0
-            kv_head_end = 1
-        else:
-            total_num_kv_heads = total_num_heads
-            num_kv_heads = total_num_kv_heads // tp_size
-            separated_q_kv = False
-            kv_head_start = tp_rank * num_kv_heads
-            kv_head_end = (tp_rank + 1) * num_kv_heads
-        num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "query_key_value" in name:
-                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-                loaded_weight_size = loaded_weight.size()
-                loaded_weight = loaded_weight.view(
-                    total_num_kv_heads, num_query_heads_per_kv_head + 2,
-                    head_size, *loaded_weight_size[1:])
-
-                wq = loaded_weight[:, :-2].reshape(-1, *loaded_weight_size[1:])
-                wk = loaded_weight[:, [-2]].reshape(-1,
-                                                    *loaded_weight_size[1:])
-                wv = loaded_weight[:, [-1]].reshape(-1,
-                                                    *loaded_weight_size[1:])
-
-                wq = wq[head_size * head_start:head_size * head_end]
-                wk = wk[head_size * kv_head_start:head_size * kv_head_end]
-                wv = wv[head_size * kv_head_start:head_size * kv_head_end]
-
-                if separated_q_kv:
-                    loaded_weight_q = wq
-                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
-                    q_weight_name = name.replace("query_key_value", "query")
-                    kv_weight_name = name.replace("query_key_value",
-                                                  "key_value")
-                    load_tensor_parallel_weights(state_dict[q_weight_name],
-                                                 loaded_weight_q,
-                                                 q_weight_name,
-                                                 self._column_parallel_weights,
-                                                 self._row_parallel_weights,
-                                                 tp_rank)
-                    load_tensor_parallel_weights(state_dict[kv_weight_name],
-                                                 loaded_weight_kv,
-                                                 kv_weight_name,
-                                                 self._column_parallel_weights,
-                                                 self._row_parallel_weights,
-                                                 tp_rank)
-                    continue
-                else:
-                    loaded_weight = torch.cat([wq, wk, wv], dim=0)
-
-            param = state_dict[name]
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)
--- a/server/vllm/vllm/model_executor/models/gpt2.py
+++ b/server/vllm/vllm/model_executor/models/gpt2.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
-# Copyright 2023 The vLLM team.
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only GPT-2 model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import GPT2Config
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttention
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (
-    convert_pyslice_to_tensor, hf_model_weights_iterator,
-    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class GPT2Attention(nn.Module):
-
-    def __init__(self, config: GPT2Config):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        assert total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = total_num_heads // tensor_model_parallel_world_size
-        self.head_dim = self.hidden_size // total_num_heads
-        self.scale = self.head_dim**-0.5
-
-        self.c_attn = ColumnParallelLinear(
-            self.hidden_size,
-            3 * self.hidden_size,
-            bias=True,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            self.hidden_size,
-            self.hidden_size,
-            bias=True,
-            input_is_parallel=True,
-        )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scale)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.c_attn(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_output = self.attn(q, k, v, key_cache, value_cache,
-                                input_metadata, cache_event)
-        attn_output, _ = self.c_proj(attn_output)
-        return attn_output
-
-
-class GPT2MLP(nn.Module):
-
-    def __init__(
-        self,
-        intermediate_size: int,
-        config: GPT2Config,
-    ):
-        super().__init__()
-        hidden_size = config.hidden_size
-        self.c_fc = ColumnParallelLinear(
-            hidden_size,
-            intermediate_size,
-            bias=True,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            intermediate_size,
-            hidden_size,
-            bias=True,
-            input_is_parallel=True,
-        )
-        self.act = get_act_fn(config.activation_function)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states, _ = self.c_fc(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states, _ = self.c_proj(hidden_states)
-        return hidden_states
-
-
-class GPT2Block(nn.Module):
-
-    def __init__(self, config: GPT2Config):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
-
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(config)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPT2MLP(inner_dim, config)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        attn_output = self.attn(
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        # residual connection
-        hidden_states = attn_output + residual
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + feed_forward_hidden_states
-        return hidden_states
-
-
-class GPT2Model(nn.Module):
-
-    def __init__(self, config: GPT2Config):
-        super().__init__()
-        self.config = config
-        assert not config.add_cross_attention
-        assert not config.scale_attn_by_inverse_layer_idx
-        assert not config.reorder_and_upcast_attn
-        self.embed_dim = config.hidden_size
-
-        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
-        # to 50304 in order to make it divisible by 64.
-        # This improves performance since GPUs are faster if the dimension
-        # is divisible by 64. In addition, it allows us to shard the embedding
-        # layer across 2, 4, 8, or more GPUs.
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
-        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-        self.h = nn.ModuleList(
-            [GPT2Block(config) for _ in range(config.num_hidden_layers)])
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.h[i]
-            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
-                                  cache_event)
-
-        hidden_states = self.ln_f(hidden_states)
-        return hidden_states
-
-
-class GPT2LMHeadModel(nn.Module):
-
-    def __init__(self, config: GPT2Config):
-        super().__init__()
-        self.config = config
-        self.transformer = GPT2Model(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = ["c_fc.weight", "c_fc.bias"]
-    _row_parallel_weights = ["c_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "lm_head.weight" in name:
-                # GPT-2 ties the weights of the embedding layer and the final
-                # linear layer.
-                continue
-            if ".attn.bias" in name or ".attn.masked_bias" in name:
-                # Skip attention mask.
-                # NOTE: "c_attn.bias" should not be skipped.
-                continue
-
-            if not name.startswith("transformer."):
-                name = "transformer." + name
-
-            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-
-            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
-            # Because of this, we need to transpose the weights.
-            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
-                if conv1d_weight_name not in name:
-                    continue
-                if not name.endswith(".weight"):
-                    continue
-                loaded_weight = loaded_weight.t()
-            param = state_dict[name]
-
-            if name == "transformer.wte.weight":
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tensor_model_parallel_rank)
-                continue
-
-            # For the fused QKV linear layer, manually shard the weights.
-            if "c_attn" in name:
-                # GPT-2's fused QKV has the shape of
-                # [3 * num_heads * head_size, hidden_size].
-                # When tensor parallelism is used, we shard the weights along
-                # the head dimension.
-                total_num_heads = self.config.num_attention_heads
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // total_num_heads
-                num_heads = total_num_heads // tensor_model_parallel_world_size
-                head_start = tensor_model_parallel_rank * num_heads
-                head_end = (tensor_model_parallel_rank + 1) * num_heads
-
-                if name.endswith(".weight"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size, hidden_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
-                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
-                elif name.endswith(".bias"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :]
-                    loaded_weight = loaded_weight.reshape(-1)
-                else:
-                    raise ValueError(f"Unexpected parameter name {name}")
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights,
-                                         tensor_model_parallel_rank)
--- a/server/vllm/vllm/model_executor/models/gpt_bigcode.py
+++ b/server/vllm/vllm/model_executor/models/gpt_bigcode.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
-# Copyright 2023 The vLLM team.
-# Copyright 2023 CTranslate2, and Michael Feil
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only GPTBigCode model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import GPTBigCodeConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttention
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (
-    convert_pyslice_to_tensor, hf_model_weights_iterator,
-    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class GPTBigCodeAttention(nn.Module):
-
-    def __init__(self, config: GPTBigCodeConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        total_num_heads = config.num_attention_heads
-        self.tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        assert total_num_heads % self.tensor_model_parallel_world_size == 0
-        self.num_heads = (total_num_heads //
-                          self.tensor_model_parallel_world_size)
-        self.head_dim = self.hidden_size // total_num_heads
-        self.scale = self.head_dim**-0.5
-
-        self.multi_query = config.multi_query
-        if self.multi_query:
-            self.num_kv_heads = 1
-            self.kv_dim = self.head_dim
-            self.c_attn_q = ColumnParallelLinear(
-                self.hidden_size,
-                self.hidden_size,
-                bias=True,
-                gather_output=False,
-            )
-            self.c_attn_kv = nn.Linear(self.hidden_size,
-                                       2 * self.kv_dim,
-                                       bias=True)
-        else:
-            self.num_kv_heads = self.num_heads
-            self.kv_dim = self.num_kv_heads * self.head_dim
-            self.c_attn = ColumnParallelLinear(
-                self.hidden_size,
-                self.hidden_size + 2 * self.kv_dim,
-                bias=True,
-                gather_output=False,
-            )
-
-        self.c_proj = RowParallelLinear(
-            self.hidden_size,
-            self.hidden_size,
-            bias=True,
-            input_is_parallel=True,
-        )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scale,
-                                   num_kv_heads=self.num_kv_heads)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        if self.multi_query:
-            q, _ = self.c_attn_q(hidden_states)
-            kv = self.c_attn_kv(hidden_states)
-            k, v = kv.split([self.kv_dim, self.kv_dim], dim=-1)
-        else:
-            qkv, _ = self.c_attn(hidden_states)
-            q, k, v = qkv.split([
-                self.hidden_size // self.tensor_model_parallel_world_size,
-                self.kv_dim, self.kv_dim
-            ],
-                                dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_output = self.attn(q, k, v, key_cache, value_cache,
-                                input_metadata, cache_event)
-        attn_output, _ = self.c_proj(attn_output)
-        return attn_output
-
-
-class GPTBigMLP(nn.Module):
-
-    def __init__(
-        self,
-        intermediate_size: int,
-        config: GPTBigCodeConfig,
-    ):
-        super().__init__()
-        hidden_size = config.hidden_size
-        self.c_fc = ColumnParallelLinear(
-            hidden_size,
-            intermediate_size,
-            bias=True,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            intermediate_size,
-            hidden_size,
-            bias=True,
-            input_is_parallel=True,
-        )
-        self.act = get_act_fn(config.activation_function)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states, _ = self.c_fc(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states, _ = self.c_proj(hidden_states)
-        return hidden_states
-
-
-class GPTBigCodeBlock(nn.Module):
-
-    def __init__(self, config: GPTBigCodeConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
-
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPTBigCodeAttention(config)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPTBigMLP(inner_dim, config)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        attn_output = self.attn(
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        # residual connection
-        hidden_states = attn_output + residual
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + feed_forward_hidden_states
-        return hidden_states
-
-
-class GPTBigCodeModel(nn.Module):
-
-    def __init__(self, config: GPTBigCodeConfig):
-        super().__init__()
-        self.config = config
-        assert not config.add_cross_attention
-
-        self.embed_dim = config.hidden_size
-
-        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
-        # to 50304 in order to make it divisible by 64.
-        # This improves performance since GPUs are faster if the dimension
-        # is divisible by 64. In addition, it allows us to shard the embedding
-        # layer across 2, 4, 8, or more GPUs.
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
-        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-        self.h = nn.ModuleList(
-            [GPTBigCodeBlock(config) for _ in range(config.num_hidden_layers)])
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.h[i]
-            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
-                                  cache_event)
-
-        hidden_states = self.ln_f(hidden_states)
-        return hidden_states
-
-
-class GPTBigCodeForCausalLM(nn.Module):
-
-    def __init__(self, config: GPTBigCodeConfig):
-        super().__init__()
-        self.config = config
-        self.transformer = GPTBigCodeModel(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = ["c_fc.weight", "c_fc.bias"]
-    _row_parallel_weights = ["c_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "lm_head.weight" in name:
-                # GPT-2 ties the weights of the embedding layer and the final
-                # linear layer.
-                continue
-            if ".attn.bias" in name:
-                # Skip attention mask.
-                # NOTE: "c_attn.bias" should not be skipped.
-                continue
-
-            if not name.startswith("transformer."):
-                name = "transformer." + name
-
-            # For the fused QKV linear layer, manually shard the weights.
-            if "c_attn" in name:
-                # GPT-2's fused QKV has the shape of
-                # [3 * num_heads * head_size, hidden_size].
-                # When tensor parallelism is used, we shard the weights along
-                # the head dimension.
-                total_num_heads = self.config.num_attention_heads
-                total_num_kv_heads = (1 if self.config.multi_query else
-                                      total_num_heads)
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // total_num_heads
-                total_kv_size = head_size * total_num_kv_heads
-                num_heads = total_num_heads // tensor_model_parallel_world_size
-                head_start = tensor_model_parallel_rank * num_heads
-                head_end = (tensor_model_parallel_rank + 1) * num_heads
-
-                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-                wq, wk, wv = torch.split(
-                    loaded_weight, [hidden_size, total_kv_size, total_kv_size],
-                    dim=0)
-
-                wq = wq[head_size * head_start:head_size * head_end]
-                if not self.config.multi_query:
-                    # Split the heads when using normal multi-head attention
-                    wk = wk[head_size * head_start:head_size * head_end]
-                    wv = wv[head_size * head_start:head_size * head_end]
-                    loaded_weight = torch.cat([wq, wk, wv], dim=0)
-                else:
-                    # For multi-query attention, we split the query
-                    # but replicate the key and value.
-                    loaded_weight_q = wq
-                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
-                    q_weight_name = name.replace("c_attn", "c_attn_q")
-                    kv_weight_name = name.replace("c_attn", "c_attn_kv")
-                    load_tensor_parallel_weights(state_dict[q_weight_name],
-                                                 loaded_weight_q,
-                                                 q_weight_name,
-                                                 self._column_parallel_weights,
-                                                 self._row_parallel_weights,
-                                                 tensor_model_parallel_rank)
-                    load_tensor_parallel_weights(state_dict[kv_weight_name],
-                                                 loaded_weight_kv,
-                                                 kv_weight_name,
-                                                 self._column_parallel_weights,
-                                                 self._row_parallel_weights,
-                                                 tensor_model_parallel_rank)
-                    continue
-
-            param = state_dict[name]
-
-            if name == "transformer.wte.weight":
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tensor_model_parallel_rank)
-                continue
-
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights,
-                                         tensor_model_parallel_rank)
--- a/server/vllm/vllm/model_executor/models/gpt_j.py
+++ b/server/vllm/vllm/model_executor/models/gpt_j.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
-# Copyright 2023 The vLLM team.
-# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only GPT-J model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import GPTJConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class GPTJAttention(nn.Module):
-    """GPT-J self-attention assembled from tensor-parallel projections.
-
-    One fused, bias-free projection produces Q, K and V, which are passed to
-    ``PagedAttentionWithRoPE`` (built with ``is_neox_style=False``). A
-    row-parallel projection then maps the attention result back to
-    ``hidden_size``.
-    """
-
-    def __init__(self, config: GPTJConfig):
-        super().__init__()
-        head_count = config.num_attention_heads
-        embed_dim = config.hidden_size
-        self.total_num_heads = head_count
-        self.hidden_size = embed_dim
-        self.head_size = embed_dim // head_count
-
-        # Fused projection: its output is later chunked into Q, K and V.
-        self.qkv_proj = ColumnParallelLinear(
-            embed_dim,
-            3 * embed_dim,
-            bias=False,
-            gather_output=False,
-        )
-        self.out_proj = RowParallelLinear(
-            embed_dim,
-            embed_dim,
-            bias=False,
-            input_is_parallel=True,
-        )
-
-        # The heads must divide evenly across the tensor-parallel group.
-        world_size = get_tensor_model_parallel_world_size()
-        assert self.total_num_heads % world_size == 0
-        self.num_heads = self.total_num_heads // world_size
-
-        assert getattr(config, "rotary", True)
-        assert config.rotary_dim % 2 == 0
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            self.head_size,
-            self.head_size**-0.5,
-            config.rotary_dim,
-            base=getattr(config, "rope_theta", 10000),
-            max_position=getattr(config, "max_position_embeddings", 8192),
-            is_neox_style=False,
-        )
-        self.warmup = False
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to Q/K/V, run paged attention, then apply ``out_proj``."""
-        fused_qkv, _ = self.qkv_proj(hidden_states)
-        query, key, value = fused_qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        context = self.attn(position_ids, query, key, value, key_cache,
-                            value_cache, input_metadata, cache_event)
-        projected, _ = self.out_proj(context)
-        return projected
-
-
-class GPTJMLP(nn.Module):
-    """GPT-J feed-forward block: ``fc_in`` -> activation -> ``fc_out``.
-
-    Args:
-        intermediate_size: Width of the hidden feed-forward layer.
-        config: Model config; ``n_embd`` and ``activation_function`` are read.
-    """
-
-    def __init__(self, intermediate_size: int, config: GPTJConfig):
-        super().__init__()
-        embed_dim = config.n_embd
-        self.fc_in = ColumnParallelLinear(embed_dim,
-                                          intermediate_size,
-                                          gather_output=False)
-        self.fc_out = RowParallelLinear(intermediate_size,
-                                        embed_dim,
-                                        input_is_parallel=True)
-        self.act = get_act_fn(config.activation_function)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """Apply the up-projection, activation, and down-projection."""
-        expanded, _ = self.fc_in(hidden_states)
-        activated = self.act(expanded)
-        projected, _ = self.fc_out(activated)
-        return projected
-
-
-class GPTJBlock(nn.Module):
-    """One GPT-J layer.
-
-    Attention and the MLP both consume the same ``ln_1`` output, and their
-    results are added to the un-normalized input.
-    """
-
-    def __init__(self, config: GPTJConfig):
-        super().__init__()
-        # Fall back to 4x the embedding width when n_inner is not configured.
-        inner_dim = (4 * config.n_embd
-                     if config.n_inner is None else config.n_inner)
-        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-        self.attn = GPTJAttention(config)
-        self.mlp = GPTJMLP(inner_dim, config)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Return ``attn(ln(x)) + mlp(ln(x)) + x`` for input ``x``."""
-        normed = self.ln_1(hidden_states)
-        attn_output = self.attn(
-            position_ids=position_ids,
-            hidden_states=normed,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        mlp_output = self.mlp(normed)
-        return attn_output + mlp_output + hidden_states
-
-
-class GPTJModel(nn.Module):
-    """GPT-J backbone: token embedding, ``n_layer`` blocks, final LayerNorm."""
-
-    def __init__(self, config: GPTJConfig):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.n_embd
-        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
-        self.h = nn.ModuleList(
-            [GPTJBlock(config) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed ``input_ids``, run every block in order, and normalize.
-
-        ``kv_caches`` and, when given, ``cache_events`` are indexed by layer.
-        """
-        hidden_states = self.wte(input_ids)
-        for layer_idx, block in enumerate(self.h):
-            layer_event = (None if cache_events is None else
-                           cache_events[layer_idx])
-            hidden_states = block(
-                position_ids,
-                hidden_states,
-                kv_caches[layer_idx],
-                input_metadata,
-                layer_event,
-            )
-        return self.ln_f(hidden_states)
-
-
-class GPTJForCausalLM(nn.Module):
-    """GPT-J backbone with a column-parallel LM head and a sampler.
-
-    Tied word embeddings are not supported; the constructor asserts that
-    ``config.tie_word_embeddings`` is false.
-    """
-
-    def __init__(self, config: GPTJConfig):
-        super().__init__()
-        self.config = config
-        assert not config.tie_word_embeddings
-        self.transformer = GPTJModel(config)
-        self.lm_head = ColumnParallelLinear(
-            config.n_embd,
-            config.vocab_size,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        """Run the backbone and sample using the LM head weight and bias."""
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata, self.lm_head.bias)
-        return next_tokens
-
-    # Name fragments handed to load_tensor_parallel_weights to classify
-    # checkpoint tensors as column- or row-parallel.
-    _column_parallel_weights = [
-        "wte.weight", "fc_in.weight", "fc_in.bias", "lm_head.weight",
-        "lm_head.bias"
-    ]
-    _row_parallel_weights = ["out_proj.weight", "fc_out.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        """Copy HuggingFace GPT-J checkpoint tensors into this model.
-
-        The checkpoint's separate ``q_proj``/``k_proj``/``v_proj`` weights are
-        written into consecutive thirds of the fused ``qkv_proj`` parameter.
-        Every other tensor goes through ``load_tensor_parallel_weights``.
-        ``attn.bias`` and ``attn.masked_bias`` entries are skipped.
-        """
-        tp_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "attn.bias" in name or "attn.masked_bias" in name:
-                continue
-
-            is_attention_weight = False
-            for stride_id, att_weight_name in enumerate(
-                ["q_proj", "k_proj", "v_proj"]):
-                if att_weight_name not in name:
-                    continue
-                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
-                # The local fused parameter stacks this rank's Q, K and V
-                # rows, so one projection's shard is a third of dim 0.
-                # Using param.shape[1] (the un-sharded input width) is only
-                # correct when the tensor-parallel world size is 1.
-                shard_size = param.shape[0] // 3
-                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
-                                              (tp_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_attention_weight = True
-                break
-            if is_attention_weight:
-                continue
-
-            param = state_dict[name]
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)
--- a/server/vllm/vllm/model_executor/models/gpt_neox.py
+++ b/server/vllm/vllm/model_executor/models/gpt_neox.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only GPT-NeoX model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import GPTNeoXConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class GPTNeoXAttention(nn.Module):
-    """GPT-NeoX self-attention assembled from tensor-parallel projections.
-
-    Rotary embeddings cover ``int(head_size * config.rotary_pct)`` dimensions
-    of each head.
-    """
-
-    def __init__(self, config: GPTNeoXConfig):
-        super().__init__()
-        head_count = config.num_attention_heads
-        embed_dim = config.hidden_size
-        self.total_num_heads = head_count
-        self.hidden_size = embed_dim
-        self.head_size = embed_dim // head_count
-
-        # The heads must divide evenly across the tensor-parallel group.
-        world_size = get_tensor_model_parallel_world_size()
-        assert self.total_num_heads % world_size == 0
-        self.num_heads = self.total_num_heads // world_size
-
-        self.query_key_value = ColumnParallelLinear(embed_dim,
-                                                    3 * embed_dim,
-                                                    gather_output=False)
-        self.dense = RowParallelLinear(embed_dim,
-                                       embed_dim,
-                                       input_is_parallel=True)
-
-        rotary_dim = int(self.head_size * config.rotary_pct)
-        assert rotary_dim % 2 == 0
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            self.head_size,
-            self.head_size**-0.5,
-            rotary_dim,
-            base=getattr(config, "rope_theta", 10000),
-            max_position=getattr(config, "max_position_embeddings", 8192),
-        )
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to Q/K/V, run paged attention, then apply ``dense``."""
-        fused_qkv, _ = self.query_key_value(hidden_states)
-        query, key, value = fused_qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        context = self.attn(position_ids, query, key, value, key_cache,
-                            value_cache, input_metadata, cache_event)
-        projected, _ = self.dense(context)
-        return projected
-
-
-class GPTNeoXMLP(nn.Module):
-    """GPT-NeoX feed-forward block.
-
-    ``dense_h_to_4h`` -> activation (``config.hidden_act``) ->
-    ``dense_4h_to_h``.
-    """
-
-    def __init__(self, config: GPTNeoXConfig):
-        super().__init__()
-        embed_dim = config.hidden_size
-        inner_dim = config.intermediate_size
-        self.dense_h_to_4h = ColumnParallelLinear(embed_dim,
-                                                  inner_dim,
-                                                  gather_output=False)
-        self.dense_4h_to_h = RowParallelLinear(inner_dim,
-                                               embed_dim,
-                                               input_is_parallel=True)
-        self.act = get_act_fn(config.hidden_act)
-
-    def forward(self, hidden_states):
-        """Apply the up-projection, activation, and down-projection."""
-        expanded, _ = self.dense_h_to_4h(hidden_states)
-        activated = self.act(expanded)
-        projected, _ = self.dense_4h_to_h(activated)
-        return projected
-
-
-class GPTNeoXLayer(nn.Module):
-    """One GPT-NeoX layer, with either parallel or sequential residuals.
-
-    ``config.use_parallel_residual`` selects which residual arrangement
-    ``forward`` computes.
-    """
-
-    def __init__(self, config: GPTNeoXConfig):
-        super().__init__()
-        self.use_parallel_residual = config.use_parallel_residual
-        embed_dim = config.hidden_size
-        self.input_layernorm = nn.LayerNorm(embed_dim,
-                                            eps=config.layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(embed_dim,
-                                                     eps=config.layer_norm_eps)
-        self.attention = GPTNeoXAttention(config)
-        self.mlp = GPTNeoXMLP(config)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Apply attention and the MLP using the configured residual mode."""
-        attn_output = self.attention(
-            position_ids=position_ids,
-            hidden_states=self.input_layernorm(hidden_states),
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-
-        if self.use_parallel_residual:
-            # Parallel form: x = x + attn(ln1(x)) + mlp(ln2(x))
-            mlp_output = self.mlp(
-                self.post_attention_layernorm(hidden_states))
-            return mlp_output + attn_output + hidden_states
-
-        # Sequential form:
-        #   x = x + attn(ln1(x))
-        #   x = x + mlp(ln2(x))
-        after_attn = attn_output + hidden_states
-        mlp_output = self.mlp(self.post_attention_layernorm(after_attn))
-        return mlp_output + after_attn
-
-
-class GPTNeoXModel(nn.Module):
-    """GPT-NeoX backbone: token embedding, layer stack, final LayerNorm."""
-
-    def __init__(self, config: GPTNeoXConfig):
-        super().__init__()
-        self.config = config
-
-        self.embed_in = VocabParallelEmbedding(config.vocab_size,
-                                               config.hidden_size)
-        self.layers = nn.ModuleList(
-            [GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
-        self.final_layer_norm = nn.LayerNorm(config.hidden_size,
-                                             eps=config.layer_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed ``input_ids``, run every layer in order, and normalize.
-
-        ``kv_caches`` and, when given, ``cache_events`` are indexed by layer.
-        """
-        hidden_states = self.embed_in(input_ids)
-        for layer_idx, layer in enumerate(self.layers):
-            layer_event = (None if cache_events is None else
-                           cache_events[layer_idx])
-            hidden_states = layer(
-                position_ids,
-                hidden_states,
-                kv_caches[layer_idx],
-                input_metadata,
-                layer_event,
-            )
-        return self.final_layer_norm(hidden_states)
-
-
-class GPTNeoXForCausalLM(nn.Module):
-    """GPT-NeoX backbone with a bias-free output embedding and a sampler."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.gpt_neox = GPTNeoXModel(config)
-        self.embed_out = ColumnParallelLinear(
-            config.hidden_size,
-            config.vocab_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        """Run the backbone and sample using the ``embed_out`` weight."""
-        hidden_states = self.gpt_neox(input_ids, positions, kv_caches,
-                                      input_metadata, cache_events)
-        return self.sampler(self.embed_out.weight, hidden_states,
-                            input_metadata)
-
-    # Name fragments handed to load_tensor_parallel_weights to classify
-    # checkpoint tensors as column- or row-parallel.
-    _column_parallel_weights = [
-        "embed_in.weight", "embed_out.weight", "dense_h_to_4h.weight",
-        "dense_h_to_4h.bias"
-    ]
-    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        """Copy HuggingFace GPT-NeoX checkpoint tensors into this model.
-
-        Attention mask buffers and rotary ``inv_freq`` entries are skipped.
-        Fused ``query_key_value`` tensors are sliced for this rank and then
-        reordered before being loaded.
-
-        Raises:
-            ValueError: If a ``query_key_value`` entry is neither a weight
-                nor a bias.
-        """
-        tp_rank = get_tensor_model_parallel_rank()
-        params = self.state_dict()
-        skipped_tags = ("attention.bias", "attention.masked_bias",
-                        "rotary_emb.inv_freq")
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if any(tag in name for tag in skipped_tags):
-                continue
-            param = params[name]
-            if "query_key_value" in name:
-                # The checkpoint's fused QKV is laid out as
-                # [num_heads * 3 * head_size, hidden_size], whereas this
-                # model needs [3 * num_heads * head_size, hidden_size], so
-                # the rank's slice is regrouped below.
-                shard_size = param.shape[0]
-                row_start = shard_size * tp_rank
-                loaded_weight = loaded_weight[row_start:row_start +
-                                              shard_size]
-
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // self.config.num_attention_heads
-                if "query_key_value.weight" in name:
-                    per_head = loaded_weight.view(-1, 3, head_size,
-                                                  hidden_size)
-                    loaded_weight = per_head.transpose(0, 1).reshape(
-                        -1, hidden_size)
-                elif "query_key_value.bias" in name:
-                    per_head = loaded_weight.view(-1, 3, head_size)
-                    loaded_weight = per_head.transpose(0, 1).reshape(-1)
-                else:
-                    raise ValueError(f"Unexpected weight name: {name}")
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights,
-                                         tp_rank)
--- a/server/vllm/vllm/model_executor/models/internlm.py
+++ b/server/vllm/vllm/model_executor/models/internlm.py
-# -*- coding: utf-8 -*-
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import LlamaConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
-                                                       RowParallelLinear,
-                                                       VocabParallelEmbedding)
-from vllm.model_executor.weight_utils import (
-    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class InternLMMLP(nn.Module):
-    """InternLM gated feed-forward block.
-
-    A single fused ``gate_up_proj`` feeds ``SiluAndMul``, followed by
-    ``down_proj``.
-
-    Raises:
-        ValueError: If ``hidden_act`` is anything other than ``"silu"``.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-    ):
-        super().__init__()
-        # Gate and up projections share one matmul, hence the doubled width.
-        self.gate_up_proj = ColumnParallelLinear(hidden_size,
-                                                 2 * intermediate_size,
-                                                 bias=False,
-                                                 gather_output=False)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           input_is_parallel=True)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        """Apply ``gate_up_proj``, the gated activation, and ``down_proj``."""
-        fused_gate_up, _ = self.gate_up_proj(x)
-        gated = self.act_fn(fused_gate_up)
-        projected, _ = self.down_proj(gated)
-        return projected
-
-
-class InternLMAttention(nn.Module):
-    """InternLM self-attention with biased QKV and output projections.
-
-    Args:
-        hidden_size: Model embedding width.
-        num_heads: Total attention heads across all tensor-parallel ranks.
-        rope_theta: Passed to ``PagedAttentionWithRoPE`` as ``base``.
-        max_position_embeddings: Passed to ``PagedAttentionWithRoPE`` as
-            ``max_position``.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        rope_theta: float = 10000,
-        max_position_embeddings: int = 8192,
-    ):
-        super().__init__()
-        self.hidden_size = hidden_size
-        world_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        # The heads must divide evenly across the tensor-parallel group.
-        assert self.total_num_heads % world_size == 0
-        self.num_heads = self.total_num_heads // world_size
-        self.head_dim = hidden_size // self.total_num_heads
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        attn_width = self.total_num_heads * self.head_dim
-        self.qkv_proj = ColumnParallelLinear(
-            hidden_size,
-            3 * attn_width,
-            bias=True,
-            gather_output=False,
-        )
-        self.o_proj = RowParallelLinear(
-            attn_width,
-            hidden_size,
-            bias=True,
-            input_is_parallel=True,
-        )
-        # Rotary embeddings span the full head dimension.
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
-            rotary_dim=self.head_dim,
-        )
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Project to Q/K/V, run paged attention, then apply ``o_proj``."""
-        fused_qkv, _ = self.qkv_proj(hidden_states)
-        query, key, value = fused_qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        context = self.attn(positions, query, key, value, key_cache,
-                            value_cache, input_metadata, cache_event)
-        projected, _ = self.o_proj(context)
-        return projected
-
-
-class InternLMDecoderLayer(nn.Module):
-    """One InternLM decoder layer.
-
-    Both the attention and MLP sub-layers apply RMSNorm first and are
-    wrapped in a residual connection.
-    """
-
-    def __init__(self, config: LlamaConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = InternLMAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            rope_theta=getattr(config, "rope_theta", 10000),
-            max_position_embeddings=getattr(config,
-                                            "max_position_embeddings", 8192),
-        )
-        self.mlp = InternLMMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """Apply the attention and MLP sub-layers, each with a residual."""
-        # Attention sub-layer: x + attn(norm(x))
-        attn_output = self.self_attn(
-            positions=positions,
-            hidden_states=self.input_layernorm(hidden_states),
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        after_attn = hidden_states + attn_output
-
-        # Feed-forward sub-layer: x + mlp(norm(x))
-        mlp_output = self.mlp(self.post_attention_layernorm(after_attn))
-        return after_attn + mlp_output
-
-
-class InternLMModel(nn.Module):
-    """InternLM backbone: token embedding, decoder layers, final RMSNorm.
-
-    The embedding table is sized to ``config.vocab_size`` rounded up to a
-    multiple of 64.
-    """
-
-    def __init__(self, config: LlamaConfig):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        # Round the vocabulary size up to the next multiple of 64.
-        padded_vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.embed_tokens = VocabParallelEmbedding(padded_vocab_size,
-                                                   config.hidden_size)
-        self.layers = nn.ModuleList([
-            InternLMDecoderLayer(config)
-            for _ in range(config.num_hidden_layers)
-        ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        """Embed ``input_ids``, run every layer in order, and normalize.
-
-        ``kv_caches`` and, when given, ``cache_events`` are indexed by layer.
-        """
-        hidden_states = self.embed_tokens(input_ids)
-        for layer_idx, layer in enumerate(self.layers):
-            layer_event = (None if cache_events is None else
-                           cache_events[layer_idx])
-            hidden_states = layer(
-                positions,
-                hidden_states,
-                kv_caches[layer_idx],
-                input_metadata,
-                layer_event,
-            )
-        return self.norm(hidden_states)
-
-
-class InternLMForCausalLM(nn.Module):
-    """InternLM backbone followed by an LM head and a sampler."""
-
-    def __init__(self, config):
-        """Build the backbone, the LM head and the sampler from ``config``."""
-        super().__init__()
-        self.config = config
-        self.model = InternLMModel(config)
-        # Round the vocabulary size up to the next multiple of 64 for the
-        # LM head; the sampler is still given the unpadded size.
-        padded_vocab_size = (config.vocab_size + 63) // 64 * 64
-        self.lm_head = ColumnParallelLinear(
-            config.hidden_size,
-            padded_vocab_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        """Run the backbone and hand its hidden states to the sampler.
-
-        The sampler is called with the LM head weight, the hidden states
-        and ``input_metadata``; its result is returned unchanged.
-        """
-        hidden = self.model(input_ids, positions, kv_caches, input_metadata,
-                            cache_events)
-        return self.sampler(self.lm_head.weight, hidden, input_metadata)
-
-    _column_parallel_weights = [
-        "qkv_proj.weight", "gate_proj.weight", "up_proj.weight"
-    ]
-    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        """Copy checkpoint tensors into this module's parameters.
-
-        Each (name, tensor) pair yielded by ``hf_model_weights_iterator``
-        is handled by the first rule that applies:
-
-        * names containing ``rotary_emb.inv_freq`` are skipped;
-        * ``embed_tokens`` / ``lm_head`` tensors go through
-          ``load_padded_tensor_parallel_vocab``;
-        * ``q_proj`` / ``k_proj`` / ``v_proj`` tensors are written into the
-          matching third of the fused ``qkv_proj`` parameter, and
-          ``gate_proj`` / ``up_proj`` tensors into the matching half of
-          ``gate_up_proj``;
-        * everything else goes through ``load_tensor_parallel_weights``.
-        """
-        tp_rank = get_tensor_model_parallel_rank()
-        params = self.state_dict()
-        # (fused parameter name, source projections in stacking order).
-        # The groups are probed in this order, so q/k/v take precedence
-        # over gate/up when looking for a match.
-        fused_groups = (
-            ("qkv_proj", ("q_proj", "k_proj", "v_proj")),
-            ("gate_up_proj", ("gate_proj", "up_proj")),
-        )
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            if "embed_tokens" in name or "lm_head" in name:
-                load_padded_tensor_parallel_vocab(params[name], loaded_weight,
-                                                  tp_rank)
-                continue
-
-            # Find the first fused group / projection the name belongs to.
-            fused_hit = None
-            for fused_name, part_names in fused_groups:
-                for slot, part_name in enumerate(part_names):
-                    if part_name in name:
-                        fused_hit = (fused_name, part_name, slot,
-                                     len(part_names))
-                        break
-                if fused_hit is not None:
-                    break
-
-            if fused_hit is not None:
-                fused_name, part_name, slot, num_parts = fused_hit
-                target = params[name.replace(part_name, fused_name)]
-                # The fused parameter stacks its parts along dim 0, one
-                # equally sized section per source projection.
-                rows = target.shape[0] // num_parts
-                # Take this rank's rows from the checkpoint tensor ...
-                shard = loaded_weight[rows * tp_rank:rows * (tp_rank + 1)]
-                # ... and write them into this projection's section.
-                dest = target.data[rows * slot:rows * (slot + 1)]
-                assert dest.shape == shard.shape
-                dest.copy_(shard)
-                continue
-
-            load_tensor_parallel_weights(params[name], loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)