Commit 0640f227 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.0' into v0.6.0-dev

parents 82f1ffdf 32e7db25
......@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple
import torch
from vllm.sequence import (ExecuteModelRequest, HiddenStates, SamplerOutput,
SequenceData, SequenceGroupMetadata)
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData,
SequenceGroupMetadata)
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
from vllm.spec_decode.interfaces import (SpeculativeProposals,
SpeculativeProposer)
......
......@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple
import torch
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
from vllm.spec_decode.top1_proposer import Top1Proposer
......
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposer
from vllm.worker.worker_base import LoraNotSupportedWorkerBase
......
......@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group,
init_model_parallel_group,
patch_tensor_parallel_group)
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
......
......@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.distributed.communication_op import broadcast_tensor_dict
from vllm.logger import init_logger
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.layers.spec_decode_base_sampler import (
SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler)
from vllm.model_executor.layers.typical_acceptance_sampler import (
TypicalAcceptanceSampler)
from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
HiddenStates, SamplerOutput, SequenceGroupMetadata,
HiddenStates, SequenceGroupMetadata,
get_all_seq_ids, get_all_seq_ids_and_request_ids)
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
......@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# used during the prefill phase.
# 2. Auto-disable enabled: The running queue size exceeds
# the specified threshold.
# 3. No request: There are no requests in the batch.
# 3. No request: There are no requests in the batch, or
# none of the requests in the batch have spec decoding enabled.
# In any of these cases, the proposer and scorer workers
# are called normally.
no_spec = num_lookahead_slots == 0 or len(
execute_model_req.seq_group_metadata_list
) == 0 or disable_all_speculation
no_spec = num_lookahead_slots == 0 or disable_all_speculation or all(
sgm.num_speculative_tokens == 0
for sgm in execute_model_req.seq_group_metadata_list)
# Broadcast how many lookahead slots are scheduled for this step, and
# whether all speculation is disabled, to all non-driver workers.
......@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self, execute_model_req: ExecuteModelRequest) -> bool:
# When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency.
disable_all_speculation = (execute_model_req.running_queue_size >=
self.disable_by_batch_size)
return disable_all_speculation
return (execute_model_req.running_queue_size >=
self.disable_by_batch_size)
def _maybe_disable_speculative_tokens(
self, disable_all_speculation: bool,
......@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# proposal len. This adds some complexity (splitting the batch into spec
# and non spec sequences) and should be removed in the future. It can be
# done by supporting per-sequence proposal lens.
_, spec_indices = split_batch_by_proposal_len(
seq_group_metadata_list,
proposal_lens_list,
select_proposal_len_zero=False)
_, non_spec_indices = split_batch_by_proposal_len(
seq_group_metadata_list,
proposal_lens_list,
select_proposal_len_zero=True)
(_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len(
seq_group_metadata_list, proposal_lens_list)
original_indices = spec_indices + non_spec_indices
# Get probabilities of target model, excluding bonus token.
proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1]
# Get probabilities of target model, including bonus tokens.
proposal_verifier_probs = proposal_scores.probs[spec_indices]
# Get non-speculative sampled tokens from target model.
non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
......@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
}
accepted_token_ids = self.spec_decode_sampler(
target_probs=proposal_verifier_probs,
target_with_bonus_probs=proposal_verifier_probs,
bonus_token_ids=bonus_token_ids,
draft_probs=proposal_probs,
draft_token_ids=proposal_token_ids,
**sampler_extra_kwargs,
)
# Append output tokens from non-speculative sequences to
# the accepted token ids tensor.
non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len +
......
......@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple
import torch
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
SequenceGroupMetadata)
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
from vllm.spec_decode.interfaces import (SpeculativeProposals,
SpeculativeProposer)
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
......@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):
# Currently only proposal lens of 0 or the global batch proposal len
# are supported.
# If max_proposal_len is defined, then we shall no exceed this
# If max_proposal_len is defined, then we shall not exceed this
# quota for nonzero_proposal
new_k = 0
if (self.max_proposal_len is None
......
import time
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Sequence, Tuple
import torch
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
SamplerOutput, SequenceGroupMetadata,
SequenceOutput)
SequenceGroupMetadata, SequenceOutput)
SeqId = int
......@@ -43,8 +43,8 @@ def get_sampled_token_logprobs(
sampled_token_ids, ]
expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
-1, -1, vocab_size)
sampled_token_ids_ranks = (logprob_tensor >=
expanded_selected_logprobs).sum(-1)
sampled_token_ids_ranks = (logprob_tensor >
expanded_selected_logprobs).sum(-1).add_(1)
return sampled_token_ids_ranks, selected_logprobs
......@@ -98,33 +98,26 @@ def create_sequence_group_output(
def split_batch_by_proposal_len(
seq_group_metadata_list: List[SequenceGroupMetadata],
proposal_lens: List[int], select_proposal_len_zero: bool
) -> Tuple[List[SequenceGroupMetadata], List[int]]:
proposal_lens: List[int],
) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[
List[SequenceGroupMetadata], List[int]]]:
"""Utility function that splits a batch based on whether the proposal len is
zero or not. We should remove this once vLLM supports per-sequence proposal
lens in a batch.
"""
if select_proposal_len_zero:
predicate = lambda proposal_len: proposal_len == 0
else:
predicate = lambda proposal_len: proposal_len != 0
indices = [
i for i, (_, proposal_len
) in enumerate(zip(seq_group_metadata_list, proposal_lens))
if predicate(proposal_len)
]
seq_groups = [
seq_group for seq_group, proposal_len in zip(
seq_group_metadata_list, proposal_lens) if predicate(proposal_len)
]
return seq_groups, indices
nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
for i, (seq_group, proposal_len) in enumerate(
zip(seq_group_metadata_list, proposal_lens)):
seq_groups, indices = nonzero_lists if proposal_len else zero_lists
seq_groups.append(seq_group)
indices.append(i)
return nonzero_lists, zero_lists
def sampler_output_to_torch(
sampler_output_list: List[SamplerOutput], sampler_transposed: bool
sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""Utility function which converts a list of SamplerOutput to tensors.
......@@ -148,18 +141,12 @@ def sampler_output_to_torch(
dim=0,
)
if sampler_transposed:
sampled_token_probs = sampled_token_probs.transpose(0, 1)
# shape: [batch_size, num_sampler_output, vocab_size]
sampled_token_logprobs = torch.stack(
[sampler_output.logprobs for sampler_output in sampler_output_list],
dim=0,
)
if sampler_transposed:
sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
# shape: [batch_size, num_sampler_output]
sampled_token_ids = torch.stack(
[
......@@ -168,7 +155,10 @@ def sampler_output_to_torch(
],
dim=0,
)
if sampler_transposed:
sampled_token_probs = sampled_token_probs.transpose(0, 1)
sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
sampled_token_ids = sampled_token_ids.transpose(0, 1)
if sampler_output_list[0].hidden_states is not None:
......
......@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import (
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
EAGLEConfig, InternVLChatConfig,
JAISConfig, MedusaConfig,
MLPSpeculatorConfig, MPTConfig,
NemotronConfig, RWConfig,
UltravoxConfig)
EAGLEConfig, ExaoneConfig,
InternVLChatConfig, JAISConfig,
MedusaConfig, MLPSpeculatorConfig,
MPTConfig, NemotronConfig,
RWConfig, UltravoxConfig)
from vllm.transformers_utils.utils import check_gguf_file
if VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig
......@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"mlp_speculator": MLPSpeculatorConfig,
"medusa": MedusaConfig,
"eagle": EAGLEConfig,
"exaone": ExaoneConfig,
"internvl_chat": InternVLChatConfig,
"nemotron": NemotronConfig,
"ultravox": UltravoxConfig,
......@@ -55,7 +57,7 @@ def get_config(
) -> PretrainedConfig:
# Separate model folder from file path for GGUF models
is_gguf = Path(model).is_file() and Path(model).suffix == ".gguf"
is_gguf = check_gguf_file(model)
if is_gguf:
kwargs["gguf_file"] = Path(model).name
model = Path(model).parent
......@@ -107,8 +109,11 @@ def get_hf_image_processor_config(
revision: Optional[str] = None,
**kwargs,
) -> Dict[str, Any]:
# ModelScope does not provide an interface for image_processor
if VLLM_USE_MODELSCOPE:
return dict()
# Separate model folder from file path for GGUF models
if Path(model).is_file() and Path(model).suffix == ".gguf":
if check_gguf_file(model):
model = Path(model).parent
return get_image_processor_config(model, revision=revision, **kwargs)
......
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
from vllm.transformers_utils.configs.exaone import ExaoneConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
......@@ -22,6 +23,7 @@ __all__ = [
"JAISConfig",
"MedusaConfig",
"EAGLEConfig",
"ExaoneConfig",
"MLPSpeculatorConfig",
"NemotronConfig",
"UltravoxConfig",
......
# coding=utf-8
# Copied from
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exaone model configuration"""
from typing import Dict
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
class ExaoneConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:
`~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Exaone
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs. Read the documentation from :
class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 50257):
Vocabulary size of the GPT Lingvo model. Defines the number of
different tokens that can be represented by the :obj:`inputs_ids`
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
size of the model.
Defines the different tokens that can be represented by the
`inputs_ids` passed to the forward method of :class:
`~transformers.EXAONEModel`.
hidden_size (:obj:`int`, `optional`, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
num_layers (:obj:`int`, `optional`, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi
Head Attention (MHA), if `num_key_value_heads=1 the model will use
Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint,
each group key and value head should be constructed by meanpooling
all the original heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
specified, will default to `num_attention_heads`.
rotary_pct (`float`, *optional*, defaults to 0.25):
percentage of hidden dimensions to allocate to rotary embeddings
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
the Transformer encoder.
activation_function (:obj:`str` or :obj:`function`, `optional`,
defaults to :obj:`"gelu_new"`):
The non-linear activation function (function or string) in the
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the
embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling
:class:`~transformers.EXAONEModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
Only relevant if ``config.is_decoder=True``.
gradient_checkpointing (:obj:`bool`, `optional`,
defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense
of slower backward pass.
Example::
>>> from transformers import ExoneModel, ExaoneConfig
>>> # Initializing a EXAONE configuration
>>> configuration = ExaoneConfig()
>>> # Initializing a model from configuration
>>> model = ExoneModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "exaone"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=102400,
max_position_embeddings=2048,
hidden_size=2048,
num_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
intermediate_size=None,
activation_function="silu",
rotary_pct=0.25,
resid_dropout=0.0,
embed_dropout=0.0,
attention_dropout=0.0,
layer_norm_epsilon=1e-6,
initializer_range=0.02,
use_cache=True,
bos_token_id=0,
eos_token_id=2,
tie_word_embeddings=True,
**kwargs,
):
super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_layers
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
if intermediate_size:
self.intermediate_size = intermediate_size
else:
self.intermediate_size = hidden_size * 4
self.activation_function = activation_function
self.resid_dropout = resid_dropout
self.embed_dropout = embed_dropout
self.attention_dropout = attention_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.rotary_pct = rotary_pct
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.use_logit_cap = kwargs.pop("use_logit_cap", False)
self.ln_no_scale = kwargs.pop("ln_no_scale", False)
self.use_gated = kwargs.pop("use_gated", False)
self.use_emb_norm = kwargs.pop("use_emb_norm", False)
self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
self.rotary_type = kwargs.pop("rotary_type", None)
self.scaling_factor = kwargs.pop("scaling_factor", 1)
self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
self.use_extra_logit = kwargs.pop("use_extra_logit", True)
self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
self.rotary_base = kwargs.pop("rotary_base", 10000.0)
self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
(rotary_pct == 0.25))
if self.use_rotary_pos:
self.use_absolute_pos = False
# coding=utf-8
# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Granite model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
logger = logging.get_logger(__name__)
class GraniteConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of
a [`GraniteModel`]. It is used to instantiate an Granite
model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Granite-3B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to
control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Granite model. Defines the number of
different tokens that can be represented by the `inputs_ids`
passed when calling [`GraniteModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi
Head Attention (MHA), if `num_key_value_heads=1` the model will use
Multi Query Attention (MQA) otherwise GQA is used. When converting
a multi-head checkpoint to a GQA checkpoint, each group key and
value head should be constructed by meanpooling all the original
heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the
decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models). Only relevant if
`config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE
embeddings. Currently supports two scaling strategies: linear and
dynamic. Their scaling factor must be a float greater than 1. The
expected format is
`{"type": strategy name, "factor": scaling factor}`.
When using this flag, don't update `max_position_embeddings` to
the expected new maximum. See the following thread for more
information on how these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
This is an experimental feature, subject to breaking API changes
in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output
projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers
in the MLP layers.
embedding_multiplier (`float`, *optional*, defaults to 1.0):
embedding multiplier
logits_scaling (`float`, *optional*, defaults to 1.0):
divisor for output logits
residual_multiplier (`float`, *optional*, defaults to 1.0):
residual multiplier
attention_multiplier (`float`, *optional*, defaults to 1.0):
attention multiplier
```python
>>> from transformers import GraniteModel, GraniteConfig
>>> # Initializing a Granite granite-3b style configuration
>>> configuration = GraniteConfig()
>>> # Initializing a model from the granite-7b style configuration
>>> model = GraniteModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "granite"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
embedding_multiplier=1.0,
logits_scaling=1.0,
residual_multiplier=1.0,
attention_multiplier=1.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.embedding_multiplier = embedding_multiplier
self.logits_scaling = logits_scaling
self.residual_multiplier = residual_multiplier
self.attention_multiplier = attention_multiplier
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
rope_config_validation(self)
......@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens(
prefix_offset = max(
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
# This is required to guard against out-of-vocab prompt token ids
_replace_none_with_empty(new_tokens)
_replace_none_with_empty(new_tokens) # type: ignore[arg-type]
return new_tokens, prefix_offset, read_offset
......
import os
import warnings
from pathlib import Path
from typing import Optional, Union
......@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizers import BaichuanTokenizer
from vllm.transformers_utils.tokenizers import (BaichuanTokenizer,
MistralTokenizer)
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async
logger = init_logger(__name__)
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
MistralTokenizer]
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
......@@ -93,51 +97,69 @@ def get_tokenizer(
kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models
is_gguf = Path(tokenizer_name).is_file() and Path(
tokenizer_name).suffix == ".gguf"
is_gguf = check_gguf_file(tokenizer_name)
if is_gguf:
kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent
try:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs)
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
if (not trust_remote_code and
("does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e))):
err_msg = (
"Failed to load the tokenizer. If the tokenizer is a custom "
"tokenizer not yet available in the HuggingFace transformers "
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
except AttributeError as e:
if "BaichuanTokenizer" in str(e):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer = BaichuanTokenizer.from_pretrained(
# if tokenizer is from official mistral org
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
if is_from_mistral_org and tokenizer_mode != "mistral":
warnings.warn(
'It is strongly recommended to run mistral models with '
'`--tokenizer_mode "mistral"` to ensure correct '
'encoding and decoding.',
FutureWarning,
stacklevel=2)
if tokenizer_mode == "mistral":
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
revision=revision)
else:
try:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs)
else:
raise e
**kwargs,
)
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
# currently being imported,
# suggest using the --trust-remote-code flag.
if not trust_remote_code and (
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)):
err_msg = ("Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
except AttributeError as e:
if "BaichuanTokenizer" in str(e):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer = BaichuanTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs,
)
else:
raise e
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
tokenizer = get_cached_tokenizer(tokenizer)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
return get_cached_tokenizer(tokenizer)
return tokenizer
def get_lora_tokenizer(lora_request: LoRARequest, *args,
......
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
__all__ = [
"BaichuanTokenizer",
]
__all__ = ["BaichuanTokenizer", "MistralTokenizer"]
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from huggingface_hub import HfApi, hf_hub_download
# yapf: disable
from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
# yapf: enable
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer)
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer)
if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ConversationMessage
@dataclass
class Encoding:
input_ids: List[int]
def find_tokenizer_file(files: List[str]):
file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
matched_files = [file for file in files if file_pattern.match(file)]
if len(matched_files) > 1:
raise OSError(f"Found {len(matched_files)} files matching the "
"pattern: {matched_files}. Make sure only one Mistral "
"tokenizer is present in {tokenizer_name}.")
elif len(matched_files) == 0:
raise OSError(f"Found {len(matched_files)} files matching the "
"pattern: {matched_files}. Make sure that a Mistral "
"tokenizer is present in {tokenizer_name}.")
return matched_files[0]
class MistralTokenizer:
def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
self.mistral = tokenizer
self.instruct = tokenizer.instruct_tokenizer
self.tokenizer = tokenizer.instruct_tokenizer.tokenizer
self.vocab_size = len(self.tokenizer.vocab())
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
# Make sure special tokens will not raise
self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE
self._is_tekken = is_tekken
# the following attributes are set to fit VLLM's design
self.is_fast = True
self.chat_template = True
self.all_special_ids: List[Any] = []
self.all_special_tokens: List[Any] = []
self.all_special_tokens_extended: List[Any] = []
@classmethod
def from_pretrained(cls,
path_or_repo_id: str,
*,
revision: Optional[str] = None) -> "MistralTokenizer":
if not Path(path_or_repo_id).exists():
assert len(path_or_repo_id.split("/")) == 2, (
"You have either provided a non-existent path: "
"{path_or_repo_id} or an invalid HF Hub repo id.")
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
path_or_repo_id, revision)
elif Path(path_or_repo_id).is_dir():
tokenizer_file_name = find_tokenizer_file(
os.listdir(path_or_repo_id))
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
else:
assert Path(
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
return cls(mistral_tokenizer)
@staticmethod
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision: Optional[str]) -> str:
api = HfApi()
repo_info = api.model_info(tokenizer_name)
files = [s.rfilename for s in repo_info.siblings]
filename = find_tokenizer_file(files)
tokenizer_file = hf_hub_download(tokenizer_name,
filename=filename,
revision=revision)
return tokenizer_file
def __call__(
self,
prompt: str,
add_special_tokens: bool = False,
truncation: bool = False,
max_length: Optional[int] = None,
):
# Mistral Tokenizers should not add special tokens
input_ids = self.encode(prompt)
if truncation:
input_ids = input_ids[:max_length]
return Encoding(input_ids=input_ids)
def get_added_vocab(self) -> List[str]:
# Mistral tokenizers have no added vocabulary
return []
def encode(self, prompt: str) -> List[int]:
# `encode ` should only be used for prompt completion
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
return self.tokenizer.encode(prompt, bos=True, eos=False)
def apply_chat_template(self,
conversation: List["ConversationMessage"],
tools: Optional[Dict[str, Any]] = None,
**kwargs) -> List[int]:
assert tools is None, "`tools` are not yet supported."
request = ChatCompletionRequest(
messages=conversation) # type: ignore[type-var]
encoded = self.mistral.encode_chat_completion(request)
# encode-decode to get clean prompt
return encoded.tokens
def convert_tokens_to_string(self, tokens: List[str]) -> str:
if self._is_tekken:
return "".join(tokens)
else:
return self.tokenizer.decode(tokens) # type: ignore[arg-type]
def decode(self, ids: Union[List[int], int]) -> str:
if isinstance(ids, int):
ids = [ids]
return self.tokenizer.decode(ids)
@property
def eos_token_id(self):
return self.tokenizer.eos_id
def convert_ids_to_tokens(
self,
ids: List[int],
skip_special_tokens: Optional[bool] = True) -> List[str]:
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
return tokens
def __len__(self):
return self.vocab_size
from os import PathLike
from pathlib import Path
from typing import Union
def check_gguf_file(model: Union[str, PathLike]) -> bool:
"""Check if the file is a GGUF model."""
model = Path(model)
if not model.is_file():
return False
elif model.suffix == ".gguf":
return True
with open(model, "rb") as f:
header = f.read(4)
return header == b"GGUF"
......@@ -25,6 +25,8 @@ import numpy.typing as npt
import psutil
import torch
import torch.types
import yaml
from packaging.version import Version
from typing_extensions import ParamSpec, TypeIs, assert_never
import vllm.envs as envs
......@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
if args is None:
args = sys.argv[1:]
if '--config' in args:
args = FlexibleArgumentParser._pull_args_from_config(args)
# Convert underscores to dashes and vice versa in argument names
processed_args = []
for arg in args:
......@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
return super().parse_args(processed_args, namespace)
@staticmethod
def _pull_args_from_config(args: List[str]) -> List[str]:
"""Method to pull arguments specified in the config file
into the command-line args variable.
The arguments in config file will be inserted between
the argument list.
example:
```yaml
port: 12323
tensor-parallel-size: 4
```
```python
$: vllm {serve,chat,complete} "facebook/opt-12B" \
--config config.yaml -tp 2
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--config', 'config.yaml',
'-tp', '2'
]
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--port', '12323',
'--tensor-parallel-size', '4',
'-tp', '2'
]
```
Please note how the config args are inserted after the sub command.
this way the order of priorities is maintained when these are args
parsed by super().
"""
assert args.count(
'--config') <= 1, "More than one config file specified!"
index = args.index('--config')
if index == len(args) - 1:
raise ValueError("No config file specified! \
Please check your command-line arguments.")
file_path = args[index + 1]
config_args = FlexibleArgumentParser._load_config_file(file_path)
# 0th index is for {serve,chat,complete}
# followed by config args
# followed by rest of cli args.
# maintaining this order will enforce the precedence
# of cli > config > defaults
args = [args[0]] + config_args + args[1:index] + args[index + 2:]
return args
@staticmethod
def _load_config_file(file_path: str) -> List[str]:
"""Loads a yaml file and returns the key value pairs as a
flattened list with argparse like pattern
```yaml
port: 12323
tensor-parallel-size: 4
```
returns:
processed_args: list[str] = [
'--port': '12323',
'--tensor-parallel-size': '4'
]
"""
extension: str = file_path.split('.')[-1]
if extension not in ('yaml', 'yml'):
raise ValueError(
"Config file must be of a yaml/yml type.\
%s supplied", extension)
# only expecting a flat dictionary of atomic types
processed_args: List[str] = []
config: Dict[str, Union[int, str]] = {}
try:
with open(file_path, 'r') as config_file:
config = yaml.safe_load(config_file)
except Exception as ex:
logger.error(
"Unable to read the config file at %s. \
Make sure path is correct", file_path)
raise ex
for key, value in config.items():
processed_args.append('--' + key)
processed_args.append(str(value))
return processed_args
async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
**kwargs):
"""Utility function to run async task in a lock"""
async with lock:
return await task(*args, **kwargs)
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
def supports_dynamo() -> bool:
base_torch_version = Version(Version(torch.__version__).base_version)
return base_torch_version >= Version("2.4.0")
......@@ -9,4 +9,4 @@ except Exception as e:
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.5"
__version__ = "0.6.0"
......@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalInputs)
from vllm.sequence import (IntermediateTensors, SamplerOutput,
SequenceGroupMetadata)
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import make_tensor_with_pad
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase,
......
......@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
from vllm.inputs import INPUT_REGISTRY, InputRegistry
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.sampling_params import SamplingParams
from vllm.sequence import (IntermediateTensors, PoolerOutput, SamplerOutput,
from vllm.sequence import (IntermediateTensors, PoolerOutput,
SequenceGroupMetadata)
from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
from vllm.worker.model_runner import (GPUModelRunnerBase,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment