Commit 7e1d5e53 authored by zhuwenwen

merge v0.3.1

parents e3378b20 5f08050d
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from typing import Any, Dict, List, Optional, Tuple
import torch
from torch import nn
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.linear import (LinearMethodBase,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding, ParallelLMHead)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_world_size)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.weight_utils import (default_weight_loader,
hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.aquila import AquilaConfig
KVCache = Tuple[torch.Tensor, torch.Tensor]
class AquilaMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.gate_up_proj = MergedColumnParallelLinear(
hidden_size, [intermediate_size] * 2,
bias=False,
linear_method=linear_method)
self.down_proj = RowParallelLinear(intermediate_size,
hidden_size,
bias=False,
linear_method=linear_method)
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
self.act_fn = SiluAndMul()
def forward(self, x):
gate_up, _ = self.gate_up_proj(x)
x = self.act_fn(gate_up)
x, _ = self.down_proj(x)
return x
class AquilaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
AquilaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
variance = hidden_states.to(torch.float32).pow(2).mean(-1,
keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance +
self.variance_epsilon)
return (self.weight * hidden_states).to(input_dtype)
class AquilaAttention(nn.Module):
def __init__(
self,
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
max_position_embeddings: int = 8192,
rope_scaling: Optional[Dict[str, Any]] = None,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.hidden_size = hidden_size
tp_size = get_tensor_model_parallel_world_size()
self.total_num_heads = num_heads
assert self.total_num_heads % tp_size == 0
self.num_heads = self.total_num_heads // tp_size
self.total_num_kv_heads = num_kv_heads
assert self.total_num_kv_heads % tp_size == 0
self.num_kv_heads = self.total_num_kv_heads // tp_size
self.head_dim = hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
self.total_num_heads,
self.total_num_kv_heads,
bias=False,
linear_method=linear_method,
)
self.o_proj = RowParallelLinear(
self.total_num_heads * self.head_dim,
hidden_size,
bias=False,
linear_method=linear_method,
)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
)
self.attn = PagedAttention(self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: KVCache,
input_metadata: InputMetadata,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
k_cache, v_cache = kv_cache
attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
output, _ = self.o_proj(attn_output)
return output
class AquilaDecoderLayer(nn.Module):
def __init__(
self,
config: AquilaConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
self.self_attn = AquilaAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
rope_scaling=rope_scaling,
linear_method=linear_method,
)
self.mlp = AquilaMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
linear_method=linear_method,
)
self.input_layernorm = AquilaRMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: KVCache,
input_metadata: InputMetadata,
) -> torch.Tensor:
# Self Attention
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states = self.self_attn(
positions=positions,
hidden_states=hidden_states,
kv_cache=kv_cache,
input_metadata=input_metadata,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class AquilaModel(nn.Module):
def __init__(
self,
config: AquilaConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
)
self.layers = nn.ModuleList([
AquilaDecoderLayer(config, linear_method)
for _ in range(config.num_hidden_layers)
])
self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[KVCache],
input_metadata: InputMetadata,
) -> torch.Tensor:
hidden_states = self.embed_tokens(input_ids)
for i in range(len(self.layers)):
layer = self.layers[i]
hidden_states = layer(
positions,
hidden_states,
kv_caches[i],
input_metadata,
)
hidden_states = self.norm(hidden_states)
return hidden_states
class AquilaForCausalLM(nn.Module):
def __init__(
self,
config,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
self.linear_method = linear_method
self.model = AquilaModel(config, linear_method)
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
self.sampler = Sampler(config.vocab_size)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[KVCache],
input_metadata: InputMetadata,
) -> torch.Tensor:
hidden_states = self.model(input_ids, positions, kv_caches,
input_metadata)
return hidden_states
def sample(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(self.lm_head.weight, hidden_states,
sampling_metadata)
return next_tokens
def load_weights(self,
model_name_or_path: str,
cache_dir: Optional[str] = None,
load_format: str = "auto",
revision: Optional[str] = None):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
for name, loaded_weight in hf_model_weights_iterator(
model_name_or_path, cache_dir, load_format, revision):
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
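The load_weights method above folds the separate q_proj/k_proj/v_proj and gate_proj/up_proj checkpoint tensors into the fused qkv_proj and gate_up_proj parameters via stacked_params_mapping. A minimal, self-contained sketch of the name routing it performs; the checkpoint names and the toy route() helper below are illustrative only, not part of the commit:

# Toy illustration of the stacked_params_mapping lookup used in load_weights.
stacked_params_mapping = [
    # (fused param, checkpoint shard name, shard id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def route(checkpoint_name: str):
    """Return (fused_param_name, shard_id) for a checkpoint tensor name."""
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in checkpoint_name:
            return checkpoint_name.replace(weight_name, param_name), shard_id
    # Fall through: the tensor is loaded as-is with default_weight_loader.
    return checkpoint_name, None

print(route("model.layers.0.self_attn.q_proj.weight"))
# -> ('model.layers.0.self_attn.qkv_proj.weight', 'q')
print(route("model.layers.0.mlp.up_proj.weight"))
# -> ('model.layers.0.mlp.gate_up_proj.weight', 1)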
@@ -28,6 +28,7 @@ from typing import Optional
 import torch
 from transformers import PretrainedConfig
+from vllm.config import LoRAConfig
 from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.weight_utils import (default_weight_loader,
@@ -56,10 +57,13 @@ class DeciLMForCausalLM(LlamaForCausalLM):
         self,
         config: Optional[PretrainedConfig] = None,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         config.num_key_value_heads = max(config.num_key_value_heads_per_layer)
         delattr(config, "num_key_value_heads_per_layer")
-        super().__init__(config=config, linear_method=linear_method)
+        super().__init__(config=config,
+                         linear_method=linear_method,
+                         lora_config=lora_config)
     def load_weights(self,
                      model_name_or_path: str,
...
@@ -25,7 +25,6 @@ from typing import Any, Dict, List, Optional, Tuple
 import torch
 from torch import nn
-import torch.nn.functional as F
 from transformers import PretrainedConfig
 from vllm.model_executor.input_metadata import InputMetadata
@@ -155,20 +154,12 @@ class DeepseekMoE(nn.Module):
            shared_output = self.shared_experts(hidden_states)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits, _ = self.gate(hidden_states)
-       routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-       routing_weights, selected_experts = torch.topk(routing_weights,
-                                                      self.top_k,
-                                                      dim=-1)
-       if self.config.norm_topk_prob:
-           routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        final_hidden_states = fused_moe(hidden_states,
                                        self.w1,
                                        self.w2,
-                                       routing_weights,
-                                       selected_experts,
+                                       router_logits,
+                                       self.top_k,
+                                       renormalize=self.config.norm_topk_prob,
                                        inplace=True)
        if self.config.n_shared_experts is not None:
...
# -*- coding: utf-8 -*-
from typing import Any, Dict, List, Optional, Tuple
import torch
from torch import nn
from transformers import LlamaConfig
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (LinearMethodBase,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding, ParallelLMHead)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_world_size)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.weight_utils import (default_weight_loader,
hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
KVCache = Tuple[torch.Tensor, torch.Tensor]
class InternLMMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.gate_up_proj = MergedColumnParallelLinear(
hidden_size, [intermediate_size] * 2,
bias=False,
linear_method=linear_method)
self.down_proj = RowParallelLinear(intermediate_size,
hidden_size,
bias=False,
linear_method=linear_method)
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
self.act_fn = SiluAndMul()
def forward(self, x):
gate_up, _ = self.gate_up_proj(x)
x = self.act_fn(gate_up)
x, _ = self.down_proj(x)
return x
class InternLMAttention(nn.Module):
def __init__(
self,
hidden_size: int,
num_heads: int,
bias: bool,
rope_theta: float = 10000,
max_position_embeddings: int = 8192,
linear_method: Optional[LinearMethodBase] = None,
rope_scaling: Optional[Dict[str, Any]] = None,
):
super().__init__()
self.hidden_size = hidden_size
tensor_model_parallel_world_size = (
get_tensor_model_parallel_world_size())
self.total_num_heads = num_heads
assert self.total_num_heads % tensor_model_parallel_world_size == 0
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)
self.head_dim = hidden_size // self.total_num_heads
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
self.total_num_heads,
bias=bias,
linear_method=linear_method,
)
self.o_proj = RowParallelLinear(
self.total_num_heads * self.head_dim,
hidden_size,
bias=bias,
linear_method=linear_method,
)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
)
self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: KVCache,
input_metadata: InputMetadata,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
q, k = self.rotary_emb(positions, q, k)
k_cache, v_cache = kv_cache
attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
output, _ = self.o_proj(attn_output)
return output
class InternLMDecoderLayer(nn.Module):
def __init__(
self,
config: LlamaConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
self.self_attn = InternLMAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
bias=config.bias,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
linear_method=linear_method,
rope_scaling=getattr(config, "rope_scaling", None),
)
self.mlp = InternLMMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
linear_method=linear_method,
)
self.input_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
self.post_attention_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: KVCache,
input_metadata: InputMetadata,
residual: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
# Self Attention
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
positions=positions,
hidden_states=hidden_states,
kv_cache=kv_cache,
input_metadata=input_metadata,
)
# Fully Connected
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
class InternLMModel(nn.Module):
def __init__(
self,
config: LlamaConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
vocab_size = ((config.vocab_size + 63) // 64) * 64
self.embed_tokens = VocabParallelEmbedding(
vocab_size,
config.hidden_size,
)
self.layers = nn.ModuleList([
InternLMDecoderLayer(config, linear_method)
for _ in range(config.num_hidden_layers)
])
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[KVCache],
input_metadata: InputMetadata,
) -> torch.Tensor:
hidden_states = self.embed_tokens(input_ids)
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
hidden_states, residual = layer(
positions,
hidden_states,
kv_caches[i],
input_metadata,
residual,
)
hidden_states, _ = self.norm(hidden_states, residual)
return hidden_states
class InternLMForCausalLM(nn.Module):
def __init__(
self,
config,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
self.linear_method = linear_method
self.model = InternLMModel(config, linear_method)
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
self.sampler = Sampler(config.vocab_size)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[KVCache],
input_metadata: InputMetadata,
) -> torch.Tensor:
hidden_states = self.model(input_ids, positions, kv_caches,
input_metadata)
return hidden_states
def sample(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(self.lm_head.weight, hidden_states,
sampling_metadata)
return next_tokens
def load_weights(self,
model_name_or_path: str,
cache_dir: Optional[str] = None,
load_format: str = "auto",
revision: Optional[str] = None):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
for name, loaded_weight in hf_model_weights_iterator(
model_name_or_path, cache_dir, load_format, revision):
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights."""
+# -*- coding: utf-8 -*-
 from typing import Any, Dict, List, Optional, Tuple
 import torch
 from torch import nn
-from vllm.transformers_utils.configs.yi import YiConfig
+from transformers import PretrainedConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -49,7 +27,7 @@ from vllm.sequence import SamplerOutput
 KVCache = Tuple[torch.Tensor, torch.Tensor]
-class YiMLP(nn.Module):
+class InternLM2MLP(nn.Module):
     def __init__(
         self,
@@ -63,10 +41,10 @@ class YiMLP(nn.Module):
             hidden_size, [intermediate_size] * 2,
             bias=False,
             linear_method=linear_method)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           linear_method=linear_method)
+        self.w2 = RowParallelLinear(intermediate_size,
+                                    hidden_size,
+                                    bias=False,
+                                    linear_method=linear_method)
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
@@ -75,11 +53,11 @@ class YiMLP(nn.Module):
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        x, _ = self.w2(x)
         return x
-class YiAttention(nn.Module):
+class InternLM2Attention(nn.Module):
     def __init__(
         self,
@@ -114,7 +92,7 @@ class YiAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
-        self.qkv_proj = QKVParallelLinear(
+        self.wqkv = QKVParallelLinear(
             hidden_size,
             self.head_dim,
             self.total_num_heads,
@@ -122,17 +100,18 @@ class YiAttention(nn.Module):
             bias=False,
             linear_method=linear_method,
         )
-        self.o_proj = RowParallelLinear(
+        self.wo = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             linear_method=linear_method,
         )
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
-            base=self.rope_theta,
+            base=rope_theta,
             rope_scaling=rope_scaling,
         )
         self.attn = PagedAttention(self.num_heads,
@@ -147,20 +126,20 @@ class YiAttention(nn.Module):
         kv_cache: KVCache,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
+        qkv, _ = self.wqkv(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         k_cache, v_cache = kv_cache
         attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
-        output, _ = self.o_proj(attn_output)
+        output, _ = self.wo(attn_output)
         return output
-class YiDecoderLayer(nn.Module):
+class InternLMDecoderLayer(nn.Module):
     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
@@ -169,7 +148,7 @@ class YiDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings",
                                           8192)
-        self.self_attn = YiAttention(
+        self.attention = InternLM2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
@@ -178,14 +157,15 @@ class YiDecoderLayer(nn.Module):
             max_position_embeddings=max_position_embeddings,
             linear_method=linear_method,
         )
-        self.mlp = YiMLP(
+        self.feed_forward = InternLM2MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             linear_method=linear_method,
         )
-        self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_norm = RMSNorm(config.hidden_size,
+                                      eps=config.rms_norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
@@ -198,10 +178,11 @@ class YiDecoderLayer(nn.Module):
         # Self Attention
         if residual is None:
             residual = hidden_states
-            hidden_states = self.ln1(hidden_states)
+            hidden_states = self.attention_norm(hidden_states)
         else:
-            hidden_states, residual = self.ln1(hidden_states, residual)
-        hidden_states = self.self_attn(
+            hidden_states, residual = self.attention_norm(
+                hidden_states, residual)
+        hidden_states = self.attention(
             positions=positions,
             hidden_states=hidden_states,
             kv_cache=kv_cache,
@@ -209,28 +190,28 @@ class YiDecoderLayer(nn.Module):
         )
         # Fully Connected
-        hidden_states, residual = self.ln2(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
-class YiModel(nn.Module):
+class InternLM2Model(nn.Module):
     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.embed_tokens = VocabParallelEmbedding(
+        self.tok_embeddings = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
         )
         self.layers = nn.ModuleList([
-            YiDecoderLayer(config, linear_method)
+            InternLMDecoderLayer(config, linear_method)
             for _ in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -242,7 +223,7 @@ class YiModel(nn.Module):
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        hidden_states = self.tok_embeddings(input_ids)
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -257,18 +238,18 @@ class YiModel(nn.Module):
         return hidden_states
-class YiForCausalLM(nn.Module):
+class InternLM2ForCausalLM(nn.Module):
     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = YiModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.model = InternLM2Model(config, linear_method)
+        self.output = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.sampler = Sampler(config.vocab_size)
     def forward(
@@ -287,7 +268,7 @@ class YiForCausalLM(nn.Module):
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+        next_tokens = self.sampler(self.output.weight, hidden_states,
                                    sampling_metadata)
         return next_tokens
@@ -298,11 +279,8 @@ class YiForCausalLM(nn.Module):
                      revision: Optional[str] = None):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
         ]
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
@@ -325,6 +303,23 @@ class YiForCausalLM(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                if "wqkv" in name:
+                    config = self.config
+                    kv_groups = config.num_attention_heads // config.num_key_value_heads
+                    head_dim = config.hidden_size // config.num_attention_heads
+                    loaded_weight = loaded_weight.view(-1, 2 + kv_groups,
+                                                       head_dim,
+                                                       loaded_weight.shape[-1])
+                    wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1],
+                                             dim=1)
+                    wq = wq.reshape(-1, wq.shape[-1])
+                    wk = wk.reshape(-1, wk.shape[-1])
+                    wv = wv.reshape(-1, wv.shape[-1])
+                    weight_loader = param.weight_loader
+                    weight_loader(param, wq, 'q')
+                    weight_loader(param, wk, 'k')
+                    weight_loader(param, wv, 'v')
+                else:
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
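The new "wqkv" branch in load_weights above de-interleaves InternLM2's packed attention weight, in which each key/value head is stored together with its group of query heads, into separate q/k/v shards before handing them to the fused parameter's weight_loader. A standalone sketch of that reshaping with small made-up dimensions (the sizes are illustrative, not from the commit):

import torch

# Hypothetical sizes: 8 query heads, 2 KV heads, head_dim 4, hidden_size 32.
num_attention_heads, num_key_value_heads, head_dim, hidden_size = 8, 2, 4, 32
kv_groups = num_attention_heads // num_key_value_heads  # query heads per KV head

# InternLM2 packs wqkv per KV head as [q x kv_groups, k, v] blocks of rows.
wqkv = torch.randn(num_key_value_heads * (kv_groups + 2) * head_dim, hidden_size)

# Same reshaping as the diff above.
w = wqkv.view(-1, 2 + kv_groups, head_dim, wqkv.shape[-1])
wq, wk, wv = torch.split(w, [kv_groups, 1, 1], dim=1)
wq = wq.reshape(-1, wq.shape[-1])  # (num_attention_heads * head_dim, hidden_size)
wk = wk.reshape(-1, wk.shape[-1])  # (num_key_value_heads * head_dim, hidden_size)
wv = wv.reshape(-1, wv.shape[-1])
print(wq.shape, wk.shape, wv.shape)
# -> torch.Size([32, 32]) torch.Size([8, 32]) torch.Size([8, 32])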
@@ -91,6 +91,7 @@ class LlamaAttention(nn.Module):
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         linear_method: Optional[LinearMethodBase] = None,
+        bias: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -120,13 +121,13 @@ class LlamaAttention(nn.Module):
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
-            bias=False,
+            bias=bias,
             linear_method=linear_method,
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
-            bias=False,
+            bias=bias,
             linear_method=linear_method,
         )
@@ -174,11 +175,13 @@ class LlamaDecoderLayer(nn.Module):
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             linear_method=linear_method,
+            bias=getattr(config, "bias", False),
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
@@ -269,7 +272,32 @@ class LlamaModel(nn.Module):
 class LlamaForCausalLM(nn.Module):
-    supports_lora = True
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
     def __init__(
         self,
@@ -281,11 +309,11 @@ class LlamaForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.model = LlamaModel(config, linear_method, lora_config=lora_config)
-        unpadded_vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
         if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            self.unpadded_vocab_size,
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE
@@ -293,7 +321,7 @@ class LlamaForCausalLM(nn.Module):
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
         )
-        self.sampler = Sampler(unpadded_vocab_size, config.vocab_size)
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
     def forward(
         self,
...
@@ -265,7 +265,32 @@ class MistralModel(nn.Module):
 class MistralForCausalLM(nn.Module):
-    supports_lora = True
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
     def __init__(
         self,
...
@@ -24,11 +24,10 @@
 from typing import List, Optional, Tuple
 import torch
-import torch.nn.functional as F
 from torch import nn
 from transformers import MixtralConfig
+from vllm.config import LoRAConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.fused_moe import fused_moe
@@ -40,7 +39,7 @@ from vllm.model_executor.layers.linear import (LinearMethodBase,
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
@@ -70,13 +69,14 @@ class MixtralMoE(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         params_dtype: Optional[torch.dtype] = None,
+        tp_size: Optional[int] = None,
     ):
         super().__init__()
-        tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
         self.num_total_experts = num_experts
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size // tp_size
+        self.intermediate_size = intermediate_size // self.tp_size
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -127,22 +127,17 @@ class MixtralMoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       self.top_k,
-                                                       dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
         final_hidden_states = fused_moe(hidden_states,
                                         self.ws,
                                         self.w2s,
-                                        routing_weights,
-                                        selected_experts,
+                                        router_logits,
+                                        self.top_k,
+                                        renormalize=True,
                                         inplace=True)
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
         return final_hidden_states.view(batch_size, sequence_length,
                                         hidden_size)
@@ -290,14 +285,19 @@ class MixtralModel(nn.Module):
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
+            self.vocab_size,
             config.hidden_size,
+            org_num_embeddings=config.vocab_size,
         )
         self.layers = nn.ModuleList([
             MixtralDecoderLayer(config, linear_method=linear_method)
@@ -324,18 +324,52 @@ class MixtralModel(nn.Module):
 class MixtralForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
     def __init__(
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = MixtralModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        self.model = MixtralModel(config,
+                                  linear_method,
+                                  lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
     def forward(
         self,
...
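For reference, the routing step that the updated fused_moe call now performs internally (softmax over the router logits, top-k expert selection, optional renormalization) is exactly what the removed lines computed by hand; a plain-PyTorch sketch of that routing, with made-up shapes:

import torch
import torch.nn.functional as F

def route_tokens(router_logits: torch.Tensor, top_k: int, renormalize: bool = True):
    # Mirrors the routing the old MixtralMoE.forward did before calling fused_moe.
    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
    routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    if renormalize:
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
    return routing_weights, selected_experts

# Hypothetical example: 5 tokens routed over 8 experts, top-2.
weights, experts = route_tokens(torch.randn(5, 8), top_k=2)
print(weights.shape, experts.shape)  # torch.Size([5, 2]) torch.Size([5, 2])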
 from collections import namedtuple
 from typing import Any, Dict, List, Optional, Union
-from torch.distributed import ProcessGroup
 import torch
+from torch.distributed import ProcessGroup
+from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
+    is_cupy_nccl_enabled_for_all_reduce,
 )
 from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce
@@ -31,8 +32,12 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
     out = custom_all_reduce(input_)
     if out is not None:
         return out
-    torch.distributed.all_reduce(input_,
-                                 group=get_tensor_model_parallel_group())
+    if is_cupy_nccl_enabled_for_all_reduce():
+        # TODO: support multiple parallel groups.
+        cupy_utils.all_reduce(input_)
+    else:
+        torch.distributed.all_reduce(input_,
+                                     group=get_tensor_model_parallel_group())
     return input_
...
"""CuPy utilities for all-reduce.
We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing
CUDA graphs, because torch.distributed.all_reduce causes errors when capturing
CUDA graphs.
NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8.
TODO: Remove this file when torch.distributed.all_reduce is fixed.
"""
import contextlib
import torch
from torch.distributed import ReduceOp
try:
import cupy
from cupy.cuda import nccl
from cupyx.distributed import NCCLBackend
except ImportError as e:
cupy = e
nccl = None
class NCCLBackend:
...
_OP_MAPPING = {
ReduceOp.SUM: "sum",
ReduceOp.PRODUCT: "prod",
ReduceOp.MIN: "min",
ReduceOp.MAX: "max",
}
class NCCLBackendWithBFloat16(NCCLBackend):
# This is enough to add bfloat16 support for most operations,
# but broadcast will fail (will require changes in compiled
# cupy code).
def _get_nccl_dtype_and_count(self, array, count=None):
nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count)
torch_dtype = getattr(array, "_torch_dtype", None)
if torch_dtype is torch.bfloat16:
nccl_dtype = nccl.NCCL_BFLOAT16
return nccl_dtype, count
def barrier(self) -> None:
raise RuntimeError(
"Currently, CuPy NCCL barrier is not supported since the TCP "
"store is immediately stopped after the initialization.")
_NCCL_BACKEND = None
_WORLD_SIZE = 0
def is_initialized() -> bool:
"""Returns whether the NCCL backend is initialized."""
return _NCCL_BACKEND is not None
@contextlib.contextmanager
def set_cupy_stream(stream: torch.cuda.Stream):
"""Set the cuda stream for communication"""
cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream,
stream.device_index)
with cupy_stream:
yield
def init_process_group(world_size: int, rank: int, host: str,
port: int) -> None:
"""Initializes the CuPy NCCL backend.
# TODO: handle NCCL timeouts.
"""
assert not is_initialized()
if isinstance(cupy, Exception):
raise ImportError(
"NCCLBackend is not available. Please install cupy.") from cupy
# TODO(woosuk): Create TP and PP process groups for CuPy.
global _NCCL_BACKEND
global _WORLD_SIZE
assert world_size > 0, f"{world_size=} should be a positive integer"
assert 0 <= rank < world_size, (
f"{rank=} should be a integer between [0, {world_size})")
cupy.cuda.runtime.setDevice(torch.cuda.current_device())
_NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port)
_WORLD_SIZE = world_size
# Stop the TCP store to prevent the deadlock issues at termination time.
# FIXME(woosuk): This is hacky. Find a more robust solution.
if rank == 0 and hasattr(_NCCL_BACKEND, "_store"):
_NCCL_BACKEND._store.stop()
def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
"""All-reduces the input tensor across the process group."""
assert input_.is_cuda, f"{input_} should be a cuda tensor"
# Hack to support bfloat16
torch_dtype = input_.dtype
if torch_dtype is torch.bfloat16:
# We need to view as float16, otherwise
# cupy will fail. This will not change
# the underlying data.
input_ = input_.view(torch.float16)
cupy_input = cupy.asarray(input_)
cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access
_NCCL_BACKEND.all_reduce(in_array=cupy_input,
out_array=cupy_input,
op=_OP_MAPPING[op])
def destroy_process_group() -> None:
"""Destroys the NCCL backend."""
global _NCCL_BACKEND
global _WORLD_SIZE
_NCCL_BACKEND = None
_WORLD_SIZE = 0
def get_world_size() -> int:
"""Returns the world size."""
return _WORLD_SIZE
def get_nccl_backend():
return _NCCL_BACKEND
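A rough usage sketch of the module above, assuming CuPy with NCCL support is installed, each rank runs in its own process, and all ranks agree on the rendezvous host and port (the host, port, and tensor values below are placeholders):

import torch
from vllm.model_executor.parallel_utils import cupy_utils

def run_rank(rank: int, world_size: int) -> None:
    torch.cuda.set_device(rank)
    # Placeholder rendezvous parameters; every rank must pass the same ones.
    cupy_utils.init_process_group(world_size=world_size, rank=rank,
                                  host="127.0.0.1", port=29500)
    x = torch.ones(4, device="cuda")
    # Run the NCCL all-reduce on the current CUDA stream.
    with cupy_utils.set_cupy_stream(torch.cuda.current_stream()):
        cupy_utils.all_reduce(x)  # in-place sum across ranks
    torch.cuda.synchronize()
    print(rank, x)  # every element now equals world_size
    cupy_utils.destroy_process_group()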
@@ -67,6 +67,10 @@ def get_handle() -> Optional["CustomAllreduce"]:
     return _CA_HANDLE
+def is_initialized() -> bool:
+    return _CA_HANDLE is not None
 @contextmanager
 def capture():
     try:
...
@@ -3,9 +3,12 @@
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
+import contextlib
 import torch
+from vllm.model_executor.parallel_utils import cupy_utils
 # Tensor model parallel group that the current rank belongs to.
 _TENSOR_MODEL_PARALLEL_GROUP = None
 # Pipeline model parallel group that the current rank belongs to.
@@ -206,3 +209,37 @@ def destroy_model_parallel():
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_GLOBAL_RANKS
     _PIPELINE_GLOBAL_RANKS = None
+    # Destroy the cupy states if any.
+    cupy_utils.destroy_process_group()
+# Whether to use cupy for nccl all reduce.
+# We use cupy for all reduce when using CUDA graph, because torch.distributed
+# is not well supported by CUDA graph.
+_ENABLE_CUPY_FOR_ALL_REDUCE = False
+@contextlib.contextmanager
+def with_cupy_nccl_for_all_reduce():
+    """use CuPy nccl instead of torch.distributed for all reduce"""
+    tp_size = get_tensor_model_parallel_world_size()
+    if tp_size == 1:
+        # No-op.
+        # NOTE(woosuk): We don't initialize CuPy when tp_size is 1.
+        yield
+    else:
+        global _ENABLE_CUPY_FOR_ALL_REDUCE
+        old = _ENABLE_CUPY_FOR_ALL_REDUCE
+        _ENABLE_CUPY_FOR_ALL_REDUCE = True
+        stream = torch.cuda.current_stream()
+        with cupy_utils.set_cupy_stream(stream):
+            yield
+        _ENABLE_CUPY_FOR_ALL_REDUCE = old
+def is_cupy_nccl_enabled_for_all_reduce():
+    """check if CuPy nccl is enabled for all reduce"""
+    global _ENABLE_CUPY_FOR_ALL_REDUCE
+    return _ENABLE_CUPY_FOR_ALL_REDUCE
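Taken together with the communication_op change above, the flag toggled by with_cupy_nccl_for_all_reduce is what makes tensor_model_parallel_all_reduce dispatch to cupy_utils.all_reduce while a CUDA graph is being captured. A rough sketch of the intended call pattern, assuming the distributed environment, tensor parallelism, and the CuPy NCCL group are already initialized (the tensor and capture details are placeholders):

import torch
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce)
from vllm.model_executor.parallel_utils.parallel_state import (
    with_cupy_nccl_for_all_reduce)

def capture_allreduce_graph(hidden: torch.Tensor) -> torch.cuda.CUDAGraph:
    graph = torch.cuda.CUDAGraph()
    # While this context is active, tensor_model_parallel_all_reduce routes
    # through cupy_utils.all_reduce instead of torch.distributed.all_reduce,
    # which keeps the collective capturable inside the CUDA graph.
    with with_cupy_nccl_for_all_reduce():
        with torch.cuda.graph(graph):
            tensor_model_parallel_all_reduce(hidden)
    return graph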
@@ -11,9 +11,9 @@ from huggingface_hub import snapshot_download, HfFileSystem
 import numpy as np
 from safetensors.torch import load_file, save_file, safe_open
 import torch
-from transformers import PretrainedConfig
 from tqdm.auto import tqdm
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (get_quantization_config,
                                                      QuantizationConfig)
@@ -83,25 +83,22 @@ def convert_bin_to_safetensor_file(
 # TODO(woosuk): Move this to other place.
-def get_quant_config(
-    quantization: str,
-    model_name_or_path: str,
-    hf_config: PretrainedConfig,
-    cache_dir: Optional[str] = None,
-) -> QuantizationConfig:
-    quant_cls = get_quantization_config(quantization)
+def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+    quant_cls = get_quantization_config(model_config.quantization)
     # Read the quantization config from the HF model config, if available.
-    hf_quant_config = getattr(hf_config, "quantization_config", None)
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
+                              None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
+    model_name_or_path = model_config.model
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, cache_dir):
+        with get_lock(model_name_or_path, model_config.download_dir):
             hf_folder = snapshot_download(model_name_or_path,
+                                          revision=model_config.revision,
                                           allow_patterns="*.json",
-                                          cache_dir=cache_dir,
+                                          cache_dir=model_config.download_dir,
                                           tqdm_class=Disabledtqdm)
     else:
         hf_folder = model_name_or_path
@@ -112,10 +109,12 @@ def get_quant_config(
             f.endswith(x) for x in quant_cls.get_config_filenames())
     ]
     if len(quant_config_files) == 0:
-        raise ValueError(f"Cannot find the config file for {quantization}")
+        raise ValueError(
+            f"Cannot find the config file for {model_config.quantization}")
     if len(quant_config_files) > 1:
-        raise ValueError(f"Found multiple config files for {quantization}: "
-                         f"{quant_config_files}")
+        raise ValueError(
+            f"Found multiple config files for {model_config.quantization}: "
+            f"{quant_config_files}")
     quant_config_file = quant_config_files[0]
     with open(quant_config_file, "r") as f:
...
...@@ -52,7 +52,6 @@ class SequenceStatus(enum.Enum): ...@@ -52,7 +52,6 @@ class SequenceStatus(enum.Enum):
class SequenceData: class SequenceData:
"""Data associated with a sequence. """Data associated with a sequence.
Args: Args:
prompt_token_ids: The token IDs of the prompt. prompt_token_ids: The token IDs of the prompt.
...@@ -197,7 +196,7 @@ class Sequence: ...@@ -197,7 +196,7 @@ class Sequence:
return self.data.cumulative_logprob return self.data.cumulative_logprob
def get_beam_search_score(self, def get_beam_search_score(self,
length_penalty: float = 0.0, length_penalty: float = 1.0,
seq_len: Optional[int] = None, seq_len: Optional[int] = None,
eos_token_id: Optional[int] = None) -> float: eos_token_id: Optional[int] = None) -> float:
"""Calculate the beam search score with length penalty. """Calculate the beam search score with length penalty.
@@ -254,6 +253,7 @@ class SequenceGroup:
        self.seqs_dict = {seq.seq_id: seq for seq in seqs}
        self.sampling_params = sampling_params
        self.arrival_time = arrival_time
        self.last_token_time = arrival_time
        self.lora_request = lora_request
        self.prefix: Optional[Prefix] = prefix
        self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -274,6 +274,12 @@ class SequenceGroup:
    def lora_int_id(self) -> int:
        return self.lora_request.lora_int_id if self.lora_request else 0

    def get_last_latency(self, now: float) -> float:
        """Gets the last token latency for request-level timings."""
        latency = now - self.last_token_time
        self.last_token_time = now
        return latency
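A small usage sketch of the latency bookkeeping introduced above; the tracker class simply mimics the two SequenceGroup members shown in the hunks, and time.monotonic() is just one reasonable clock choice.

import time


class LatencyTracker:
    """Stand-in mimicking last_token_time / get_last_latency above."""

    def __init__(self, arrival_time: float) -> None:
        self.last_token_time = arrival_time

    def get_last_latency(self, now: float) -> float:
        latency = now - self.last_token_time
        self.last_token_time = now
        return latency


tracker = LatencyTracker(arrival_time=time.monotonic())
# After each decoded token, record the inter-token (request-level) latency:
inter_token_latency = tracker.get_last_latency(time.monotonic())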
    def get_max_num_running_seqs(self) -> int:
        """The maximum number of sequences running in parallel in the remaining
        lifetime of the request."""
...
@@ -15,8 +15,11 @@ def init_test_distributed_environment(
                                     tensor_parallel_size,
                                     worker_use_ray=True)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    init_distributed_environment(
        parallel_config,
        rank,
        cupy_port=None,
        distributed_init_method=distributed_init_method)


def multi_process_tensor_parallel(
...
@@ -5,14 +5,12 @@ from transformers import AutoConfig, PretrainedConfig
from vllm.transformers_utils.configs import *

_CONFIG_REGISTRY = {
    "aquila": AquilaConfig,
    "baichuan": BaiChuanConfig,
    "chatglm": ChatGLMConfig,
    "mpt": MPTConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
    "yi": YiConfig,
}
...
from vllm.transformers_utils.configs.aquila import AquilaConfig
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.mpt import MPTConfig
@@ -7,14 +6,11 @@ from vllm.transformers_utils.configs.qwen import QWenConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.yi import YiConfig

__all__ = [
    "AquilaConfig",
    "BaiChuanConfig",
    "ChatGLMConfig",
    "MPTConfig",
    "QWenConfig",
    "RWConfig",
    "YiConfig",
]
# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Aquila model configuration"""
from transformers import PretrainedConfig
class AquilaConfig(PretrainedConfig):
model_type = "aquila"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=100008,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.006,
rms_norm_eps=1e-5,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
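A quick, illustrative check of the defaults defined above; nothing here is part of the diff, it only shows that AquilaConfig behaves like any other PretrainedConfig subclass.

config = AquilaConfig()
# With num_key_value_heads left as None it falls back to num_attention_heads.
assert config.num_key_value_heads == config.num_attention_heads == 32
assert config.model_type == "aquila"

# Overrides pass straight through to PretrainedConfig:
small = AquilaConfig(num_hidden_layers=2, hidden_size=256, intermediate_size=688)
print(small.to_json_string())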
""" Yi model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class YiConfig(PretrainedConfig):
r"""
Reference:
https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py
"""
model_type = "Yi"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=64000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=4,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
output_attentions=False,
rope_theta=5000000.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.output_attentions = output_attentions
self.rope_theta = rope_theta
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
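One default worth noting above is num_key_value_heads=4 against num_attention_heads=32, i.e. grouped-query attention with eight query heads sharing each KV head. The arithmetic below only illustrates that ratio and the resulting per-token KV-cache width; it is not part of the diff.

config = YiConfig()
# Grouped-query attention: 32 query heads share 4 KV heads.
queries_per_kv = config.num_attention_heads // config.num_key_value_heads
assert queries_per_kv == 8

head_dim = config.hidden_size // config.num_attention_heads      # 4096 // 32 = 128
kv_width_per_token = 2 * config.num_key_value_heads * head_dim   # keys + values = 1024
print(queries_per_kv, head_dim, kv_width_per_token)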
@@ -228,7 +228,8 @@ def create_kv_caches_with_random(
    device: Optional[str] = "cuda",
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    if isinstance(cache_dtype, str):
        if cache_dtype == "auto":
@@ -257,10 +258,13 @@ def create_kv_caches_with_random(
        key_cache = torch.empty(size=key_cache_shape,
                                dtype=torch_dtype,
                                device=device)
        if cache_dtype == 'fp8_e5m2':
            _generate_random_fp8_e5m2(key_cache, -scale, scale)
        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
            key_cache.uniform_(-scale, scale)
        else:
            raise ValueError(
                f"Does not support key cache of type {cache_dtype}")
        key_caches.append(key_cache)

    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
@@ -269,9 +273,12 @@ def create_kv_caches_with_random(
        value_cache = torch.empty(size=value_cache_shape,
                                  dtype=torch_dtype,
                                  device=device)
        if cache_dtype == 'fp8_e5m2':
            _generate_random_fp8_e5m2(value_cache, -scale, scale)
        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
            value_cache.uniform_(-scale, scale)
        else:
            raise ValueError(
                f"Does not support value cache of type {cache_dtype}")
        value_caches.append(value_cache)
    return key_caches, value_caches
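A hedged usage sketch of the helper patched above. Only cache_dtype, seed, and device are visible in the hunks; the remaining keyword names are inferred from the cache shapes shown and may not match the real signature exactly.

key_caches, value_caches = create_kv_caches_with_random(
    num_blocks=128,        # assumed parameter name
    block_size=16,         # assumed parameter name
    num_layers=2,          # assumed parameter name
    num_heads=8,           # assumed parameter name
    head_size=64,          # assumed parameter name
    cache_dtype="half",    # exercises the uniform_() branch above
    seed=0,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
# One (key, value) cache per layer, filled with random values in [-scale, scale].
assert len(key_caches) == len(value_caches) == 2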