Merge tag 'v0.6.0' into v0.6.0-dev

0640f227 · zhuwenwen · 82f1ffdf · 32e7db25 · 0640f227 · 0640f227
Commit 0640f227 authored Sep 09, 2024 by zhuwenwen
20 changed files
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple
 import torch
-from vllm.sequence import (ExecuteModelRequest, HiddenStates, SamplerOutput,
+from vllm.model_executor.layers.sampler import SamplerOutput
-                           SequenceData, SequenceGroupMetadata)
+from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData,
+                           SequenceGroupMetadata)
 from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                         SpeculativeProposer)

--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple
 import torch
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer

--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.interfaces import SpeculativeProposer
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase

--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group,
                                             init_model_parallel_group,
                                             patch_tensor_parallel_group)
 from vllm.logger import init_logger
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.layers.spec_decode_base_sampler import (
    SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler)
 from vllm.model_executor.layers.typical_acceptance_sampler import (
    TypicalAcceptanceSampler)
 from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
-                           HiddenStates, SamplerOutput, SequenceGroupMetadata,
+                           HiddenStates, SequenceGroupMetadata,
                           get_all_seq_ids, get_all_seq_ids_and_request_ids)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
        #    used during the prefill phase.
        # 2. Auto-disable enabled: The running queue size exceeds
        #    the specified threshold.
-        # 3. No request: There are no requests in the batch.
+        # 3. No request: There are no requests in the batch, or
+        #    none of the requests in the batch have spec decoding enabled.
        # In any of these cases, the proposer and scorer workers
        # are called normally.
-        no_spec = num_lookahead_slots == 0 or len(
+        no_spec = num_lookahead_slots == 0 or disable_all_speculation or all(
-            execute_model_req.seq_group_metadata_list
+            sgm.num_speculative_tokens == 0
-        ) == 0 or disable_all_speculation
+            for sgm in execute_model_req.seq_group_metadata_list)
        # Broadcast how many lookahead slots are scheduled for this step, and
        # whether all speculation is disabled, to all non-driver workers.
@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
            self, execute_model_req: ExecuteModelRequest) -> bool:
        # When the batch size is too large, disable speculative decoding
        # to stop trading off throughput for latency.
-        disable_all_speculation = (execute_model_req.running_queue_size >=
+        return (execute_model_req.running_queue_size >=
-                                   self.disable_by_batch_size)
+                self.disable_by_batch_size)
-        return disable_all_speculation
    def _maybe_disable_speculative_tokens(
            self, disable_all_speculation: bool,
@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
        # proposal len. This adds some complexity (splitting the batch into spec
        # and non spec sequences) and should be removed in the future. It can be
        # done by supporting per-sequence proposal lens.
-        _, spec_indices = split_batch_by_proposal_len(
+        (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len(
-            seq_group_metadata_list,
+            seq_group_metadata_list, proposal_lens_list)
-            proposal_lens_list,
-            select_proposal_len_zero=False)
-        _, non_spec_indices = split_batch_by_proposal_len(
-            seq_group_metadata_list,
-            proposal_lens_list,
-            select_proposal_len_zero=True)
        original_indices = spec_indices + non_spec_indices
-        # Get probabilities of target model, excluding bonus token.
+        # Get probabilities of target model, including bonus tokens.
-        proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1]
+        proposal_verifier_probs = proposal_scores.probs[spec_indices]
        # Get non-speculative sampled tokens from target model.
        non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
            }
        accepted_token_ids = self.spec_decode_sampler(
-            target_probs=proposal_verifier_probs,
+            target_with_bonus_probs=proposal_verifier_probs,
            bonus_token_ids=bonus_token_ids,
            draft_probs=proposal_probs,
            draft_token_ids=proposal_token_ids,
            **sampler_extra_kwargs,
        )
        # Append output tokens from non-speculative sequences to
        # the accepted token ids tensor.
        non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len +

--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple
 import torch
-from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+from vllm.model_executor.layers.sampler import SamplerOutput
-                           SequenceGroupMetadata)
+from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                         SpeculativeProposer)
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):
            # Currently only proposal lens of 0 or the global batch proposal len
            # are supported.
-            # If max_proposal_len is defined, then we shall no exceed this
+            # If max_proposal_len is defined, then we shall not exceed this
            # quota for nonzero_proposal
            new_k = 0
            if (self.max_proposal_len is None

--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
 import time
 from contextlib import contextmanager
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Sequence, Tuple
 import torch
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SamplerOutput, SequenceGroupMetadata,
+                           SequenceGroupMetadata, SequenceOutput)
-                           SequenceOutput)
 SeqId = int
@@ -43,8 +43,8 @@ def get_sampled_token_logprobs(
                                       sampled_token_ids, ]
    expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
        -1, -1, vocab_size)
-    sampled_token_ids_ranks = (logprob_tensor >=
+    sampled_token_ids_ranks = (logprob_tensor >
-                               expanded_selected_logprobs).sum(-1)
+                               expanded_selected_logprobs).sum(-1).add_(1)
    return sampled_token_ids_ranks, selected_logprobs
@@ -98,33 +98,26 @@ def create_sequence_group_output(
 def split_batch_by_proposal_len(
    seq_group_metadata_list: List[SequenceGroupMetadata],
-    proposal_lens: List[int], select_proposal_len_zero: bool
+    proposal_lens: List[int],
-) -> Tuple[List[SequenceGroupMetadata], List[int]]:
+) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[
+        List[SequenceGroupMetadata], List[int]]]:
    """Utility function that splits a batch based on whether the proposal len is
    zero or not. We should remove this once vLLM supports per-sequence proposal
    lens in a batch.
    """
-    if select_proposal_len_zero:
+    nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
-        predicate = lambda proposal_len: proposal_len == 0
+    zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
-    else:
+    for i, (seq_group, proposal_len) in enumerate(
-        predicate = lambda proposal_len: proposal_len != 0
+            zip(seq_group_metadata_list, proposal_lens)):
+        seq_groups, indices = nonzero_lists if proposal_len else zero_lists
-    indices = [
+        seq_groups.append(seq_group)
-        i for i, (_, proposal_len
+        indices.append(i)
-                  ) in enumerate(zip(seq_group_metadata_list, proposal_lens))
+    return nonzero_lists, zero_lists
-        if predicate(proposal_len)
-    ]
-    seq_groups = [
-        seq_group for seq_group, proposal_len in zip(
-            seq_group_metadata_list, proposal_lens) if predicate(proposal_len)
-    ]
-    return seq_groups, indices
 def sampler_output_to_torch(
-    sampler_output_list: List[SamplerOutput], sampler_transposed: bool
+    sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """Utility function which converts a list of SamplerOutput to tensors.
@@ -148,18 +141,12 @@ def sampler_output_to_torch(
        dim=0,
    )
-    if sampler_transposed:
-        sampled_token_probs = sampled_token_probs.transpose(0, 1)
    # shape: [batch_size, num_sampler_output, vocab_size]
    sampled_token_logprobs = torch.stack(
        [sampler_output.logprobs for sampler_output in sampler_output_list],
        dim=0,
    )
-    if sampler_transposed:
-        sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
    # shape: [batch_size, num_sampler_output]
    sampled_token_ids = torch.stack(
        [
@@ -168,7 +155,10 @@ def sampler_output_to_torch(
        ],
        dim=0,
    )
    if sampler_transposed:
+        sampled_token_probs = sampled_token_probs.transpose(0, 1)
+        sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
        sampled_token_ids = sampled_token_ids.transpose(0, 1)
    if sampler_output_list[0].hidden_states is not None:

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import (
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             EAGLEConfig, InternVLChatConfig,
+                                             EAGLEConfig, ExaoneConfig,
-                                             JAISConfig, MedusaConfig,
+                                             InternVLChatConfig, JAISConfig,
-                                             MLPSpeculatorConfig, MPTConfig,
+                                             MedusaConfig, MLPSpeculatorConfig,
-                                             NemotronConfig, RWConfig,
+                                             MPTConfig, NemotronConfig,
-                                             UltravoxConfig)
+                                             RWConfig, UltravoxConfig)
+from vllm.transformers_utils.utils import check_gguf_file
 if VLLM_USE_MODELSCOPE:
    from modelscope import AutoConfig
@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    "mlp_speculator": MLPSpeculatorConfig,
    "medusa": MedusaConfig,
    "eagle": EAGLEConfig,
+    "exaone": ExaoneConfig,
    "internvl_chat": InternVLChatConfig,
    "nemotron": NemotronConfig,
    "ultravox": UltravoxConfig,
@@ -55,7 +57,7 @@ def get_config(
 ) -> PretrainedConfig:
    # Separate model folder from file path for GGUF models
-    is_gguf = Path(model).is_file() and Path(model).suffix == ".gguf"
+    is_gguf = check_gguf_file(model)
    if is_gguf:
        kwargs["gguf_file"] = Path(model).name
        model = Path(model).parent
@@ -107,8 +109,11 @@ def get_hf_image_processor_config(
    revision: Optional[str] = None,
    **kwargs,
 ) -> Dict[str, Any]:
+    # ModelScope does not provide an interface for image_processor
+    if VLLM_USE_MODELSCOPE:
+        return dict()
    # Separate model folder from file path for GGUF models
-    if Path(model).is_file() and Path(model).suffix == ".gguf":
+    if check_gguf_file(model):
        model = Path(model).parent
    return get_image_processor_config(model, revision=revision, **kwargs)

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
+from vllm.transformers_utils.configs.exaone import ExaoneConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
@@ -22,6 +23,7 @@ __all__ = [
    "JAISConfig",
    "MedusaConfig",
    "EAGLEConfig",
+    "ExaoneConfig",
    "MLPSpeculatorConfig",
    "NemotronConfig",
    "UltravoxConfig",

--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
+# coding=utf-8
+# Copied from
+# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
+# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Exaone model configuration"""
+from typing import Dict
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    :class:`~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo
+    model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a
+    similar configuration to that of the Exaone model.
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
+    and can be used to control the model outputs. Read the documentation from
+    :class:`~transformers.PretrainedConfig` for more information.
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+            Vocabulary size of the GPT Lingvo model. Defines the number of
+            different tokens that can be represented by the :obj:`inputs_ids`
+            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
+            size of the model.
+            Defines the different tokens that can be represented by the
+            `inputs_ids` passed to the forward method of
+            :class:`~transformers.EXAONEModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (:obj:`int`, `optional`, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi
+            Head Attention (MHA), if `num_key_value_heads=1` the model will use
+            Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint,
+            each group key and value head should be constructed by meanpooling
+            all the original heads within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
+            specified, will default to `num_attention_heads`.
+        rotary_pct (`float`, *optional*, defaults to 0.25):
+            percentage of hidden dimensions to allocate to rotary embeddings
+        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
+            the Transformer encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`,
+        defaults to :obj:`"gelu_new"`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
+            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probability for all fully connected layers in the
+            embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            :class:`~transformers.EXAONEModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+            Only relevant if ``config.is_decoder=True``.
+        gradient_checkpointing (:obj:`bool`, `optional`,
+        defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense
+            of slower backward pass.
+        Example::
+            >>> from transformers import ExaoneModel, ExaoneConfig
+            >>> # Initializing an EXAONE configuration
+            >>> configuration = ExaoneConfig()
+            >>> # Initializing a model from configuration
+            >>> model = ExaoneModel(configuration)
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
+    """
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rotary_pct=0.25,
+        resid_dropout=0.0,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs,
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.resid_dropout = resid_dropout
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rotary_pct = rotary_pct
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
+        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
+        self.use_gated = kwargs.pop("use_gated", False)
+        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
+        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
+        self.rotary_type = kwargs.pop("rotary_type", None)
+        self.scaling_factor = kwargs.pop("scaling_factor", 1)
+        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
+        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
+        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
+        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
+        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
+        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
+                                                 (rotary_pct == 0.25))
+        if self.use_rotary_pos:
+            self.use_absolute_pos = False
--- a/vllm/transformers_utils/configs/granite.py
+++ b/vllm/transformers_utils/configs/granite.py
+# coding=utf-8
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Granite model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class GraniteConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of
+    a [`GraniteModel`]. It is used to instantiate a Granite
+    model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Granite-3B.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Granite model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`GraniteModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi
+            Head Attention (MHA), if `num_key_value_heads=1` the model will use
+            Multi Query Attention (MQA) otherwise GQA is used. When converting
+            a multi-head checkpoint to a GQA checkpoint, each group key and
+            value head should be constructed by meanpooling all the original
+            heads within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
+            specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the
+            decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE
+            embeddings. Currently supports two scaling strategies: linear and
+            dynamic. Their scaling factor must be a float greater than 1. The
+            expected format is
+            `{"type": strategy name, "factor": scaling factor}`.
+            When using this flag, don't update `max_position_embeddings` to
+            the expected new maximum. See the following thread for more
+            information on how these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
+            This is an experimental feature, subject to breaking API changes
+            in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output
+            projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers
+            in the MLP layers.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0):
+            embedding multiplier
+        logits_scaling (`float`, *optional*, defaults to 1.0):
+            divisor for output logits
+        residual_multiplier (`float`, *optional*, defaults to 1.0):
+            residual multiplier
+        attention_multiplier (`float`, *optional*, defaults to 1.0):
+            attention multiplier
+    ```python
+    >>> from transformers import GraniteModel, GraniteConfig
+    >>> # Initializing a Granite granite-3b style configuration
+    >>> configuration = GraniteConfig()
+    >>> # Initializing a model from the granite-3b style configuration
+    >>> model = GraniteModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "granite"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        embedding_multiplier=1.0,
+        logits_scaling=1.0,
+        residual_multiplier=1.0,
+        attention_multiplier=1.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.residual_multiplier = residual_multiplier
+        self.attention_multiplier = attention_multiplier
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        rope_config_validation(self)
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens(
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # This is required to guard against out-of-vocab prompt token ids
-    _replace_none_with_empty(new_tokens)
+    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset

--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
 import os
+import warnings
 from pathlib import Path
 from typing import Optional, Union
@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizers import BaichuanTokenizer
+from vllm.transformers_utils.tokenizers import (BaichuanTokenizer,
+                                                MistralTokenizer)
+from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async
 logger = init_logger(__name__)
-AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
+                     MistralTokenizer]
 def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
@@ -93,51 +97,69 @@ def get_tokenizer(
        kwargs["truncation_side"] = "left"
    # Separate model folder from file path for GGUF models
-    is_gguf = Path(tokenizer_name).is_file() and Path(
+    is_gguf = check_gguf_file(tokenizer_name)
-        tokenizer_name).suffix == ".gguf"
    if is_gguf:
        kwargs["gguf_file"] = Path(tokenizer_name).name
        tokenizer_name = Path(tokenizer_name).parent
-    try:
+    # if tokenizer is from official mistral org
-        tokenizer = AutoTokenizer.from_pretrained(
+    is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
-            tokenizer_name,
+    if is_from_mistral_org and tokenizer_mode != "mistral":
-            *args,
+        warnings.warn(
-            trust_remote_code=trust_remote_code,
+            'It is strongly recommended to run mistral models with '
-            revision=revision,
+            '`--tokenizer_mode "mistral"` to ensure correct '
-            **kwargs)
+            'encoding and decoding.',
-    except ValueError as e:
+            FutureWarning,
-        # If the error pertains to the tokenizer class not existing or not
+            stacklevel=2)
-        # currently being imported, suggest using the --trust-remote-code flag.
-        if (not trust_remote_code and
+    if tokenizer_mode == "mistral":
-            ("does not exist or is not currently imported." in str(e)
+        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
-             or "requires you to execute the tokenizer file" in str(e))):
+                                                     revision=revision)
-            err_msg = (
+    else:
-                "Failed to load the tokenizer. If the tokenizer is a custom "
+        try:
-                "tokenizer not yet available in the HuggingFace transformers "
+            tokenizer = AutoTokenizer.from_pretrained(
-                "library, consider setting `trust_remote_code=True` in LLM "
-                "or using the `--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
-        else:
-            raise e
-    except AttributeError as e:
-        if "BaichuanTokenizer" in str(e):
-            # This is for the error "'BaichuanTokenizer' object has no
-            # attribute 'sp_model'".
-            tokenizer = BaichuanTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
-                **kwargs)
+                **kwargs,
-        else:
+            )
-            raise e
+        except ValueError as e:
+            # If the error pertains to the tokenizer class not existing or not
+            # currently being imported,
+            # suggest using the --trust-remote-code flag.
+            if not trust_remote_code and (
+                    "does not exist or is not currently imported." in str(e)
+                    or "requires you to execute the tokenizer file" in str(e)):
+                err_msg = ("Failed to load the tokenizer. If the tokenizer "
+                           "is a custom tokenizer not yet available in the "
+                           "HuggingFace transformers library, consider "
+                           "setting `trust_remote_code=True` in LLM or using "
+                           "the `--trust-remote-code` flag in the CLI.")
+                raise RuntimeError(err_msg) from e
+            else:
+                raise e
+        except AttributeError as e:
+            if "BaichuanTokenizer" in str(e):
+                # This is for the error "'BaichuanTokenizer' object has no
+                # attribute 'sp_model'".
+                tokenizer = BaichuanTokenizer.from_pretrained(
+                    tokenizer_name,
+                    *args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    **kwargs,
+                )
+            else:
+                raise e
+        if not isinstance(tokenizer, PreTrainedTokenizerFast):
+            logger.warning(
+                "Using a slow tokenizer. This might cause a significant "
+                "slowdown. Consider using a fast tokenizer instead.")
+        tokenizer = get_cached_tokenizer(tokenizer)
-    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+    return tokenizer
-        logger.warning(
-            "Using a slow tokenizer. This might cause a significant "
-            "slowdown. Consider using a fast tokenizer instead.")
-    return get_cached_tokenizer(tokenizer)
 def get_lora_tokenizer(lora_request: LoRARequest, *args,

--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
 from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-__all__ = [
+__all__ = ["BaichuanTokenizer", "MistralTokenizer"]
-    "BaichuanTokenizer",
-]
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
+import os
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from huggingface_hub import HfApi, hf_hub_download
+# yapf: disable
+from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest
+from mistral_common.tokens.tokenizers.mistral import (
+    MistralTokenizer as PublicMistralTokenizer)
+# yapf: enable
+from mistral_common.tokens.tokenizers.sentencepiece import (
+    SentencePieceTokenizer)
+from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
+                                                     Tekkenizer)
+if TYPE_CHECKING:
+    from vllm.entrypoints.chat_utils import ConversationMessage
+@dataclass
+class Encoding:
+    """Minimal container for the result of tokenizing a prompt."""
+    # Token ids of the encoded prompt.
+    input_ids: List[int]
+def find_tokenizer_file(files: List[str]) -> str:
+    """Return the single Mistral tokenizer file name found in `files`.
+
+    A name qualifies when it starts with `tokenizer.model.v` or is exactly
+    `tekken.json`.
+
+    Args:
+        files: Candidate file names (not full paths).
+
+    Returns:
+        The one matching file name.
+
+    Raises:
+        OSError: If no file matches, or if more than one file matches.
+    """
+    file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
+    matched_files = [file for file in files if file_pattern.match(file)]
+    if len(matched_files) > 1:
+        # Every fragment that interpolates a value must be an f-string,
+        # otherwise the placeholders are printed verbatim.
+        raise OSError(f"Found {len(matched_files)} files matching the "
+                      f"pattern: {matched_files}. Make sure only one "
+                      "Mistral tokenizer is present in the given location.")
+    elif len(matched_files) == 0:
+        raise OSError(f"Found {len(matched_files)} files matching the "
+                      f"pattern: {file_pattern.pattern}. Make sure that a "
+                      "Mistral tokenizer is present in the given location.")
+    return matched_files[0]
+class MistralTokenizer:
+    """Wrapper around a `mistral_common` tokenizer.
+
+    It exposes the methods and attributes (e.g. `encode`, `decode`,
+    `convert_ids_to_tokens`, `all_special_ids`) that the rest of vLLM
+    accesses on a tokenizer object.
+    """
+    def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
+        """Wrap an already loaded `mistral_common` tokenizer.
+
+        Args:
+            tokenizer: The public Mistral tokenizer; its instruct tokenizer
+                and the underlying base tokenizer are cached on the instance.
+        """
+        self.mistral = tokenizer
+        self.instruct = tokenizer.instruct_tokenizer
+        self.tokenizer = tokenizer.instruct_tokenizer.tokenizer
+        self.vocab_size = len(self.tokenizer.vocab())
+        assert isinstance(self.tokenizer,
+                          (Tekkenizer, SentencePieceTokenizer)), type(
+                              self.tokenizer)
+        if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
+            # Make sure special tokens will not raise
+            self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE
+        self._is_tekken = is_tekken
+        # the following attributes are set to fit VLLM's design
+        self.is_fast = True
+        self.chat_template = True
+        self.all_special_ids: List[Any] = []
+        self.all_special_tokens: List[Any] = []
+        self.all_special_tokens_extended: List[Any] = []
+    @classmethod
+    def from_pretrained(cls,
+                        path_or_repo_id: str,
+                        *,
+                        revision: Optional[str] = None) -> "MistralTokenizer":
+        """Load a tokenizer from a local file, a local directory or the Hub.
+
+        Args:
+            path_or_repo_id: Path to a tokenizer file, path to a directory
+                containing exactly one tokenizer file, or an
+                `organisation/name` HF Hub repo id (used when the path does
+                not exist locally).
+            revision: Hub revision to download from; only used for repo ids.
+
+        Returns:
+            A `MistralTokenizer` wrapping the loaded tokenizer.
+
+        Raises:
+            AssertionError: If the value is neither an existing path nor of
+                the form `organisation/name`.
+            OSError: If a directory or repo holds zero or several candidate
+                tokenizer files.
+        """
+        path = Path(path_or_repo_id)
+        if not path.exists():
+            assert len(path_or_repo_id.split("/")) == 2, (
+                "You have either provided a non-existent path: "
+                f"{path_or_repo_id} or an invalid HF Hub repo id.")
+            tokenizer_file = cls._download_mistral_tokenizer_from_hf(
+                path_or_repo_id, revision)
+        elif path.is_dir():
+            tokenizer_file_name = find_tokenizer_file(
+                os.listdir(path_or_repo_id))
+            tokenizer_file = str(path / tokenizer_file_name)
+        else:
+            assert path.is_file(), f"Invalid path: {path_or_repo_id}"
+            # A direct path to the tokenizer file is used as-is; without
+            # this assignment the name below would be unbound.
+            tokenizer_file = str(path)
+        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
+        return cls(mistral_tokenizer)
+    @staticmethod
+    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
+                                            revision: Optional[str]) -> str:
+        """Download the tokenizer file of a Hub repo and return its path.
+
+        Args:
+            tokenizer_name: HF Hub repo id.
+            revision: Revision to inspect and download from, or None.
+
+        Returns:
+            The value returned by `hf_hub_download` for the tokenizer file.
+        """
+        api = HfApi()
+        # List the files of the same revision that is downloaded below, so
+        # the selected file name is guaranteed to exist at that revision.
+        repo_info = api.model_info(tokenizer_name, revision=revision)
+        files = [s.rfilename for s in repo_info.siblings]
+        filename = find_tokenizer_file(files)
+        tokenizer_file = hf_hub_download(tokenizer_name,
+                                         filename=filename,
+                                         revision=revision)
+        return tokenizer_file
+    def __call__(
+        self,
+        prompt: str,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ):
+        """Tokenize `prompt` and return an `Encoding`.
+
+        `add_special_tokens` is accepted but not used. When `truncation` is
+        true the ids are cut to the first `max_length` entries (a `None`
+        `max_length` keeps everything).
+        """
+        # Mistral Tokenizers should not add special tokens
+        input_ids = self.encode(prompt)
+        if truncation:
+            input_ids = input_ids[:max_length]
+        return Encoding(input_ids=input_ids)
+    def get_added_vocab(self) -> List[str]:
+        """Return the added vocabulary, which is always empty here."""
+        # Mistral tokenizers have no added vocabulary
+        return []
+    def encode(self, prompt: str) -> List[int]:
+        """Encode a plain prompt, requesting a BOS token and no EOS token."""
+        # `encode ` should only be used for prompt completion
+        # it should never be used for chat_completion.
+        # For chat completion use `apply_chat_template`
+        return self.tokenizer.encode(prompt, bos=True, eos=False)
+    def apply_chat_template(self,
+                            conversation: List["ConversationMessage"],
+                            tools: Optional[Dict[str, Any]] = None,
+                            **kwargs) -> List[int]:
+        """Encode a chat conversation into token ids.
+
+        Args:
+            conversation: Messages forwarded to `ChatCompletionRequest`.
+            tools: Must be None; tools are not supported yet.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            The token ids of the encoded chat completion request.
+        """
+        assert tools is None, "`tools` are not yet supported."
+        request = ChatCompletionRequest(
+            messages=conversation)  # type: ignore[type-var]
+        encoded = self.mistral.encode_chat_completion(request)
+        # Token ids are returned directly; no text prompt is produced.
+        return encoded.tokens
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Join token pieces into a string.
+
+        Tekken pieces are concatenated directly; otherwise the pieces are
+        passed to the underlying tokenizer's `decode`.
+        """
+        if self._is_tekken:
+            return "".join(tokens)
+        else:
+            return self.tokenizer.decode(tokens)  # type: ignore[arg-type]
+    def decode(self, ids: Union[List[int], int]) -> str:
+        """Decode one token id or a list of token ids into text."""
+        if isinstance(ids, int):
+            ids = [ids]
+        return self.tokenizer.decode(ids)
+    @property
+    def eos_token_id(self):
+        """The `eos_id` of the underlying tokenizer."""
+        return self.tokenizer.eos_id
+    def convert_ids_to_tokens(
+            self,
+            ids: List[int],
+            skip_special_tokens: Optional[bool] = True) -> List[str]:
+        """Map each token id to its piece via `id_to_piece`.
+
+        Raises:
+            AssertionError: If `skip_special_tokens` is falsy, because
+                keeping special tokens is not supported.
+        """
+        # TODO(Patrick) - potentially allow special tokens to not be skipped
+        assert (
+            skip_special_tokens
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+        assert isinstance(self.tokenizer,
+                          (Tekkenizer, SentencePieceTokenizer)), type(
+                              self.tokenizer)
+        tokens = [self.tokenizer.id_to_piece(id) for id in ids]
+        return tokens
+    def __len__(self):
+        """Return the vocabulary size computed at construction time."""
+        return self.vocab_size
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
+from os import PathLike
+from pathlib import Path
+from typing import Union
+def check_gguf_file(model: Union[str, PathLike]) -> bool:
+    """Tell whether `model` points at a GGUF file.
+
+    An existing regular file counts as GGUF when it has the `.gguf` suffix
+    or when its first four bytes are `GGUF`. Anything that is not a regular
+    file yields False.
+    """
+    candidate = Path(model)
+    if not candidate.is_file():
+        return False
+    if candidate.suffix == ".gguf":
+        return True
+    # No telling suffix: fall back to the leading magic bytes.
+    with candidate.open("rb") as stream:
+        magic = stream.read(4)
+    return magic == b"GGUF"
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -25,6 +25,8 @@ import numpy.typing as npt
 import psutil
 import torch
 import torch.types
+import yaml
+from packaging.version import Version
 from typing_extensions import ParamSpec, TypeIs, assert_never
 import vllm.envs as envs
@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
        if args is None:
            args = sys.argv[1:]
+        if '--config' in args:
+            args = FlexibleArgumentParser._pull_args_from_config(args)
        # Convert underscores to dashes and vice versa in argument names
        processed_args = []
        for arg in args:
@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
        return super().parse_args(processed_args, namespace)
+    @staticmethod
+    def _pull_args_from_config(args: List[str]) -> List[str]:
+        """Replace `--config <file>` with the options stored in that file.
+
+        The options read from the file are inserted right after the first
+        argument (the sub command) and before every other command-line
+        argument, so that when the list is parsed afterwards the precedence
+        is cli > config > defaults.
+
+        Example, with a config file containing::
+
+            port: 12323
+            tensor-parallel-size: 4
+
+        the argument list::
+
+            ["serve", "facebook/opt-12B", "--config", "config.yaml",
+             "-tp", "2"]
+
+        becomes::
+
+            ["serve", "--port", "12323", "--tensor-parallel-size", "4",
+             "facebook/opt-12B", "-tp", "2"]
+
+        Args:
+            args: Command-line arguments containing `--config`.
+
+        Returns:
+            A new argument list without `--config` and its value.
+
+        Raises:
+            AssertionError: If `--config` is given more than once.
+            ValueError: If `--config` is the last argument (no file given),
+                or if `--config` is not present at all.
+        """
+        assert args.count(
+            '--config') <= 1, "More than one config file specified!"
+        index = args.index('--config')
+        if index == len(args) - 1:
+            # Implicit literal concatenation keeps the message free of the
+            # indentation a backslash continuation would embed.
+            raise ValueError("No config file specified! "
+                             "Please check your command-line arguments.")
+        file_path = args[index + 1]
+        config_args = FlexibleArgumentParser._load_config_file(file_path)
+        remaining = args[index + 2:]
+        if index == 0:
+            # `--config` comes first, so no sub command precedes it; the
+            # flag itself must not be kept as the leading argument.
+            return config_args + remaining
+        # 0th index is for {serve,chat,complete}
+        # followed by config args
+        # followed by rest of cli args.
+        # maintaining this order will enforce the precedence
+        # of cli > config > defaults
+        return [args[0]] + config_args + args[1:index] + remaining
+    @staticmethod
+    def _load_config_file(file_path: str) -> List[str]:
+        """Load a yaml file and flatten it into argparse-style arguments.
+
+        A file containing::
+
+            port: 12323
+            tensor-parallel-size: 4
+
+        is returned as::
+
+            ['--port', '12323', '--tensor-parallel-size', '4']
+
+        Args:
+            file_path: Path of the config file; it must end in `.yaml` or
+                `.yml`.
+
+        Returns:
+            The flattened `--key value` list; empty for an empty file.
+
+        Raises:
+            ValueError: If the extension is not yaml/yml, or if the file's
+                top level is not a mapping.
+            Exception: Whatever opening or parsing the file raises, after
+                the failure has been logged.
+        """
+        extension: str = file_path.split('.')[-1]
+        if extension not in ('yaml', 'yml'):
+            # Exceptions do not apply %-formatting to their arguments, so
+            # the message has to be built before it is raised.
+            raise ValueError("Config file must be of a yaml/yml type. "
+                             f"{extension} supplied")
+        try:
+            with open(file_path, 'r') as config_file:
+                config = yaml.safe_load(config_file)
+        except Exception:
+            logger.error(
+                "Unable to read the config file at %s. "
+                "Make sure path is correct", file_path)
+            raise
+        # An empty yaml document loads as None: nothing to add.
+        if config is None:
+            return []
+        # only expecting a flat dictionary of atomic types
+        if not isinstance(config, dict):
+            raise ValueError(
+                "Config file must contain a flat mapping of options. "
+                f"{type(config).__name__} found")
+        processed_args: List[str] = []
+        for key, value in config.items():
+            processed_args.append(f"--{key}")
+            processed_args.append(str(value))
+        return processed_args
 async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
                               **kwargs):
     """Utility function to run async task in a lock.

     Acquires `lock`, awaits `task(*args, **kwargs)` while holding it, and
     returns the awaited result.
     """
     async with lock:
         return await task(*args, **kwargs)
+# Dynamo does not really work well with vLLM on PyTorch versions < 2.4.0:
+# earlier versions do not support FakeScalarType, which breaks dynamo for
+# any ops registered using ScalarType.
+def supports_dynamo() -> bool:
+    """Return True when the installed PyTorch is at least 2.4.0."""
+    installed = Version(torch.__version__)
+    # Only the base version takes part in the comparison.
+    release_only = Version(installed.base_version)
+    minimum = Version("2.4.0")
+    return release_only >= minimum
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -9,4 +9,4 @@ except Exception as e:
                  stacklevel=2)
    __commit__ = "COMMIT_HASH_PLACEHOLDER"
-__version__ = "0.5.5"
+__version__ = "0.6.0"
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                             MultiModalInputs)
-from vllm.sequence import (IntermediateTensors, SamplerOutput,
+from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-                           SequenceGroupMetadata)
 from vllm.utils import make_tensor_with_pad
 from vllm.worker.model_runner_base import (
    ModelRunnerBase, ModelRunnerInputBase,

--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (IntermediateTensors, PoolerOutput, SamplerOutput,
+from vllm.sequence import (IntermediateTensors, PoolerOutput,
                           SequenceGroupMetadata)
 from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
 from vllm.worker.model_runner import (GPUModelRunnerBase,