Commit 0640f227 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.0' into v0.6.0-dev

parents 82f1ffdf 32e7db25
...@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple ...@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple
import torch import torch
from vllm.sequence import (ExecuteModelRequest, HiddenStates, SamplerOutput, from vllm.model_executor.layers.sampler import SamplerOutput
SequenceData, SequenceGroupMetadata) from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData,
SequenceGroupMetadata)
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
from vllm.spec_decode.interfaces import (SpeculativeProposals, from vllm.spec_decode.interfaces import (SpeculativeProposals,
SpeculativeProposer) SpeculativeProposer)
......
...@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple ...@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple
import torch import torch
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.spec_decode.top1_proposer import Top1Proposer
......
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple from typing import List, Optional, Set, Tuple
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposer from vllm.spec_decode.interfaces import SpeculativeProposer
from vllm.worker.worker_base import LoraNotSupportedWorkerBase from vllm.worker.worker_base import LoraNotSupportedWorkerBase
......
...@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group, ...@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group,
init_model_parallel_group, init_model_parallel_group,
patch_tensor_parallel_group) patch_tensor_parallel_group)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
......
...@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig ...@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.distributed.communication_op import broadcast_tensor_dict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.layers.spec_decode_base_sampler import ( from vllm.model_executor.layers.spec_decode_base_sampler import (
SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler)
from vllm.model_executor.layers.typical_acceptance_sampler import ( from vllm.model_executor.layers.typical_acceptance_sampler import (
TypicalAcceptanceSampler) TypicalAcceptanceSampler)
from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
HiddenStates, SamplerOutput, SequenceGroupMetadata, HiddenStates, SequenceGroupMetadata,
get_all_seq_ids, get_all_seq_ids_and_request_ids) get_all_seq_ids, get_all_seq_ids_and_request_ids)
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
...@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# used during the prefill phase. # used during the prefill phase.
# 2. Auto-disable enabled: The running queue size exceeds # 2. Auto-disable enabled: The running queue size exceeds
# the specified threshold. # the specified threshold.
# 3. No request: There are no requests in the batch. # 3. No request: There are no requests in the batch, or
# none of the requests in the batch have spec decoding enabled.
# In any of these cases, the proposer and scorer workers # In any of these cases, the proposer and scorer workers
# are called normally. # are called normally.
no_spec = num_lookahead_slots == 0 or len( no_spec = num_lookahead_slots == 0 or disable_all_speculation or all(
execute_model_req.seq_group_metadata_list sgm.num_speculative_tokens == 0
) == 0 or disable_all_speculation for sgm in execute_model_req.seq_group_metadata_list)
# Broadcast how many lookahead slots are scheduled for this step, and # Broadcast how many lookahead slots are scheduled for this step, and
# whether all speculation is disabled, to all non-driver workers. # whether all speculation is disabled, to all non-driver workers.
...@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self, execute_model_req: ExecuteModelRequest) -> bool: self, execute_model_req: ExecuteModelRequest) -> bool:
# When the batch size is too large, disable speculative decoding # When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency. # to stop trading off throughput for latency.
disable_all_speculation = (execute_model_req.running_queue_size >= return (execute_model_req.running_queue_size >=
self.disable_by_batch_size) self.disable_by_batch_size)
return disable_all_speculation
def _maybe_disable_speculative_tokens( def _maybe_disable_speculative_tokens(
self, disable_all_speculation: bool, self, disable_all_speculation: bool,
...@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# proposal len. This adds some complexity (splitting the batch into spec # proposal len. This adds some complexity (splitting the batch into spec
# and non spec sequences) and should be removed in the future. It can be # and non spec sequences) and should be removed in the future. It can be
# done by supporting per-sequence proposal lens. # done by supporting per-sequence proposal lens.
_, spec_indices = split_batch_by_proposal_len( (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len(
seq_group_metadata_list, seq_group_metadata_list, proposal_lens_list)
proposal_lens_list,
select_proposal_len_zero=False)
_, non_spec_indices = split_batch_by_proposal_len(
seq_group_metadata_list,
proposal_lens_list,
select_proposal_len_zero=True)
original_indices = spec_indices + non_spec_indices original_indices = spec_indices + non_spec_indices
# Get probabilities of target model, excluding bonus token. # Get probabilities of target model, including bonus tokens.
proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1] proposal_verifier_probs = proposal_scores.probs[spec_indices]
# Get non-speculative sampled tokens from target model. # Get non-speculative sampled tokens from target model.
non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
...@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
} }
accepted_token_ids = self.spec_decode_sampler( accepted_token_ids = self.spec_decode_sampler(
target_probs=proposal_verifier_probs, target_with_bonus_probs=proposal_verifier_probs,
bonus_token_ids=bonus_token_ids, bonus_token_ids=bonus_token_ids,
draft_probs=proposal_probs, draft_probs=proposal_probs,
draft_token_ids=proposal_token_ids, draft_token_ids=proposal_token_ids,
**sampler_extra_kwargs, **sampler_extra_kwargs,
) )
# Append output tokens from non-speculative sequences to # Append output tokens from non-speculative sequences to
# the accepted token ids tensor. # the accepted token ids tensor.
non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len +
......
...@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple ...@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple
import torch import torch
from vllm.sequence import (ExecuteModelRequest, SamplerOutput, from vllm.model_executor.layers.sampler import SamplerOutput
SequenceGroupMetadata) from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
from vllm.spec_decode.interfaces import (SpeculativeProposals, from vllm.spec_decode.interfaces import (SpeculativeProposals,
SpeculativeProposer) SpeculativeProposer)
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
...@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer): ...@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):
# Currently only proposal lens of 0 or the global batch proposal len # Currently only proposal lens of 0 or the global batch proposal len
# are supported. # are supported.
# If max_proposal_len is defined, then we shall no exceed this # If max_proposal_len is defined, then we shall not exceed this
# quota for nonzero_proposal # quota for nonzero_proposal
new_k = 0 new_k = 0
if (self.max_proposal_len is None if (self.max_proposal_len is None
......
import time import time
from contextlib import contextmanager from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Sequence, Tuple
import torch import torch
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
SamplerOutput, SequenceGroupMetadata, SequenceGroupMetadata, SequenceOutput)
SequenceOutput)
SeqId = int SeqId = int
...@@ -43,8 +43,8 @@ def get_sampled_token_logprobs( ...@@ -43,8 +43,8 @@ def get_sampled_token_logprobs(
sampled_token_ids, ] sampled_token_ids, ]
expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
-1, -1, vocab_size) -1, -1, vocab_size)
sampled_token_ids_ranks = (logprob_tensor >= sampled_token_ids_ranks = (logprob_tensor >
expanded_selected_logprobs).sum(-1) expanded_selected_logprobs).sum(-1).add_(1)
return sampled_token_ids_ranks, selected_logprobs return sampled_token_ids_ranks, selected_logprobs
...@@ -98,33 +98,26 @@ def create_sequence_group_output( ...@@ -98,33 +98,26 @@ def create_sequence_group_output(
def split_batch_by_proposal_len( def split_batch_by_proposal_len(
seq_group_metadata_list: List[SequenceGroupMetadata], seq_group_metadata_list: List[SequenceGroupMetadata],
proposal_lens: List[int], select_proposal_len_zero: bool proposal_lens: List[int],
) -> Tuple[List[SequenceGroupMetadata], List[int]]: ) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[
List[SequenceGroupMetadata], List[int]]]:
"""Utility function that splits a batch based on whether the proposal len is """Utility function that splits a batch based on whether the proposal len is
zero or not. We should remove this once vLLM supports per-sequence proposal zero or not. We should remove this once vLLM supports per-sequence proposal
lens in a batch. lens in a batch.
""" """
if select_proposal_len_zero: nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
predicate = lambda proposal_len: proposal_len == 0 zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
else: for i, (seq_group, proposal_len) in enumerate(
predicate = lambda proposal_len: proposal_len != 0 zip(seq_group_metadata_list, proposal_lens)):
seq_groups, indices = nonzero_lists if proposal_len else zero_lists
indices = [ seq_groups.append(seq_group)
i for i, (_, proposal_len indices.append(i)
) in enumerate(zip(seq_group_metadata_list, proposal_lens)) return nonzero_lists, zero_lists
if predicate(proposal_len)
]
seq_groups = [
seq_group for seq_group, proposal_len in zip(
seq_group_metadata_list, proposal_lens) if predicate(proposal_len)
]
return seq_groups, indices
def sampler_output_to_torch( def sampler_output_to_torch(
sampler_output_list: List[SamplerOutput], sampler_transposed: bool sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""Utility function which converts a list of SamplerOutput to tensors. """Utility function which converts a list of SamplerOutput to tensors.
...@@ -148,18 +141,12 @@ def sampler_output_to_torch( ...@@ -148,18 +141,12 @@ def sampler_output_to_torch(
dim=0, dim=0,
) )
if sampler_transposed:
sampled_token_probs = sampled_token_probs.transpose(0, 1)
# shape: [batch_size, num_sampler_output, vocab_size] # shape: [batch_size, num_sampler_output, vocab_size]
sampled_token_logprobs = torch.stack( sampled_token_logprobs = torch.stack(
[sampler_output.logprobs for sampler_output in sampler_output_list], [sampler_output.logprobs for sampler_output in sampler_output_list],
dim=0, dim=0,
) )
if sampler_transposed:
sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
# shape: [batch_size, num_sampler_output] # shape: [batch_size, num_sampler_output]
sampled_token_ids = torch.stack( sampled_token_ids = torch.stack(
[ [
...@@ -168,7 +155,10 @@ def sampler_output_to_torch( ...@@ -168,7 +155,10 @@ def sampler_output_to_torch(
], ],
dim=0, dim=0,
) )
if sampler_transposed: if sampler_transposed:
sampled_token_probs = sampled_token_probs.transpose(0, 1)
sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
sampled_token_ids = sampled_token_ids.transpose(0, 1) sampled_token_ids = sampled_token_ids.transpose(0, 1)
if sampler_output_list[0].hidden_states is not None: if sampler_output_list[0].hidden_states is not None:
......
...@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import ( ...@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import (
from vllm.envs import VLLM_USE_MODELSCOPE from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
EAGLEConfig, InternVLChatConfig, EAGLEConfig, ExaoneConfig,
JAISConfig, MedusaConfig, InternVLChatConfig, JAISConfig,
MLPSpeculatorConfig, MPTConfig, MedusaConfig, MLPSpeculatorConfig,
NemotronConfig, RWConfig, MPTConfig, NemotronConfig,
UltravoxConfig) RWConfig, UltravoxConfig)
from vllm.transformers_utils.utils import check_gguf_file
if VLLM_USE_MODELSCOPE: if VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig from modelscope import AutoConfig
...@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { ...@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"mlp_speculator": MLPSpeculatorConfig, "mlp_speculator": MLPSpeculatorConfig,
"medusa": MedusaConfig, "medusa": MedusaConfig,
"eagle": EAGLEConfig, "eagle": EAGLEConfig,
"exaone": ExaoneConfig,
"internvl_chat": InternVLChatConfig, "internvl_chat": InternVLChatConfig,
"nemotron": NemotronConfig, "nemotron": NemotronConfig,
"ultravox": UltravoxConfig, "ultravox": UltravoxConfig,
...@@ -55,7 +57,7 @@ def get_config( ...@@ -55,7 +57,7 @@ def get_config(
) -> PretrainedConfig: ) -> PretrainedConfig:
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
is_gguf = Path(model).is_file() and Path(model).suffix == ".gguf" is_gguf = check_gguf_file(model)
if is_gguf: if is_gguf:
kwargs["gguf_file"] = Path(model).name kwargs["gguf_file"] = Path(model).name
model = Path(model).parent model = Path(model).parent
...@@ -107,8 +109,11 @@ def get_hf_image_processor_config( ...@@ -107,8 +109,11 @@ def get_hf_image_processor_config(
revision: Optional[str] = None, revision: Optional[str] = None,
**kwargs, **kwargs,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
# ModelScope does not provide an interface for image_processor
if VLLM_USE_MODELSCOPE:
return dict()
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
if Path(model).is_file() and Path(model).suffix == ".gguf": if check_gguf_file(model):
model = Path(model).parent model = Path(model).parent
return get_image_processor_config(model, revision=revision, **kwargs) return get_image_processor_config(model, revision=revision, **kwargs)
......
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig
from vllm.transformers_utils.configs.exaone import ExaoneConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library. # `FalconConfig` class from the official HuggingFace transformers library.
...@@ -22,6 +23,7 @@ __all__ = [ ...@@ -22,6 +23,7 @@ __all__ = [
"JAISConfig", "JAISConfig",
"MedusaConfig", "MedusaConfig",
"EAGLEConfig", "EAGLEConfig",
"ExaoneConfig",
"MLPSpeculatorConfig", "MLPSpeculatorConfig",
"NemotronConfig", "NemotronConfig",
"UltravoxConfig", "UltravoxConfig",
......
# coding=utf-8
# Copied from
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exaone model configuration"""
from typing import Dict
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
# Module-level logger obtained from the transformers logging helpers.
logger = logging.get_logger(__name__)
# str -> str mapping that is left empty here; the visible code in this module
# never adds entries to it.
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
class ExaoneConfig(PretrainedConfig):
    r"""Configuration class holding the hyper-parameters of an EXAONE model.

    Configuration objects inherit from
    :class:`~transformers.PretrainedConfig`; read the documentation of that
    class for the options shared by every configuration.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 102400):
            Vocabulary size of the model. Defines the number of different
            tokens that can be represented by the :obj:`inputs_ids` passed
            to the model.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        hidden_size (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers (:obj:`int`, `optional`, defaults to 32):
            Number of hidden layers in the Transformer encoder. The same
            value is also stored as ``num_hidden_layers``.
        num_attention_heads (:obj:`int`, `optional`, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (:obj:`int`, `optional`):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each
            group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details
            checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
            If it is not specified, will default to `num_attention_heads`.
        intermediate_size (:obj:`int`, `optional`):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer.
            When omitted (or otherwise falsy), ``hidden_size * 4`` is used.
        activation_function (:obj:`str` or :obj:`function`, `optional`,
            defaults to :obj:`"silu"`):
            The non-linear activation function (function or string).
        rotary_pct (:obj:`float`, `optional`, defaults to 0.25):
            Percentage of hidden dimensions to allocate to rotary
            embeddings. It also drives the default of the
            ``rescale_before_lm_head`` option (see below).
        resid_dropout (:obj:`float`, `optional`, defaults to 0.0):
            Dropout value stored on the configuration as ``resid_dropout``.
            NOTE(review): presumably applied on the residual path; the
            consumer is not visible from this file.
        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
        bos_token_id (:obj:`int`, `optional`, defaults to 0):
            Beginning-of-sequence token id; forwarded to the base class.
        eos_token_id (:obj:`int`, `optional`, defaults to 2):
            End-of-sequence token id; forwarded to the base class.
        tie_word_embeddings (:obj:`bool`, `optional`,
            defaults to :obj:`True`):
            Forwarded unchanged to the base class constructor.
        **kwargs:
            Forwarded to :class:`~transformers.PretrainedConfig`. The
            following optional keys are additionally read here (default in
            parentheses): ``use_logit_cap`` (False), ``ln_no_scale``
            (False), ``use_gated`` (False), ``use_emb_norm`` (False),
            ``use_rotary_pos`` (False), ``rotary_type`` (None),
            ``scaling_factor`` (1), ``use_absolute_pos`` (True),
            ``use_extra_logit`` (True), ``rotary_expand_length`` (None),
            ``rotary_base`` (10000.0), ``use_qkv_fuse`` (False) and
            ``rescale_before_lm_head`` (True exactly when
            ``rotary_pct == 0.25``). Setting ``use_rotary_pos`` to a truthy
            value forces ``use_absolute_pos`` to False.

    Example::

        >>> # Initializing an EXAONE configuration with a custom width
        >>> configuration = ExaoneConfig(hidden_size=1024)
        >>> # intermediate_size falls back to 4 * hidden_size
        >>> configuration.intermediate_size
        4096
    """

    model_type = "exaone"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets callers use the conventional `num_hidden_layers` name for the
    # value this config stores as `num_layers`.
    attribute_map = {"num_hidden_layers": "num_layers"}

    def __init__(
        self,
        vocab_size=102400,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        intermediate_size=None,
        activation_function="silu",
        rotary_pct=0.25,
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_layers

        # Without an explicit key/value head count, use one per attention
        # head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        # A missing (or falsy) intermediate size falls back to 4x the hidden
        # size.
        if intermediate_size:
            self.intermediate_size = intermediate_size
        else:
            self.intermediate_size = hidden_size * 4

        self.activation_function = activation_function
        self.resid_dropout = resid_dropout
        self.embed_dropout = embed_dropout
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rotary_pct = rotary_pct
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # `**kwargs` handed the base class its own copy of the mapping, so
        # the optional keys below are still available to read here.
        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
        self.use_gated = kwargs.pop("use_gated", False)
        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
        self.rotary_type = kwargs.pop("rotary_type", None)
        self.scaling_factor = kwargs.pop("scaling_factor", 1)
        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
        # Unless given explicitly, this flag is on only for the default
        # rotary_pct of 0.25.
        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                                 (rotary_pct == 0.25))
        # Enabling rotary positions switches absolute positions off.
        if self.use_rotary_pos:
            self.use_absolute_pos = False
# coding=utf-8
# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Granite model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
# Module-level logger obtained from the transformers logging helpers.
logger = logging.get_logger(__name__)
class GraniteConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of
a [`GraniteModel`]. It is used to instantiate an Granite
model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Granite-3B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to
control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Granite model. Defines the number of
different tokens that can be represented by the `inputs_ids`
passed when calling [`GraniteModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi
Head Attention (MHA), if `num_key_value_heads=1` the model will use
Multi Query Attention (MQA) otherwise GQA is used. When converting
a multi-head checkpoint to a GQA checkpoint, each group key and
value head should be constructed by meanpooling all the original
heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the
decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models). Only relevant if
`config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE
embeddings. Currently supports two scaling strategies: linear and
dynamic. Their scaling factor must be a float greater than 1. The
expected format is
`{"type": strategy name, "factor": scaling factor}`.
When using this flag, don't update `max_position_embeddings` to
the expected new maximum. See the following thread for more
information on how these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
This is an experimental feature, subject to breaking API changes
in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output
projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers
in the MLP layers.
embedding_multiplier (`float`, *optional*, defaults to 1.0):
embedding multiplier
logits_scaling (`float`, *optional*, defaults to 1.0):
divisor for output logits
residual_multiplier (`float`, *optional*, defaults to 1.0):
residual multiplier
attention_multiplier (`float`, *optional*, defaults to 1.0):
attention multiplier
```python
>>> from transformers import GraniteModel, GraniteConfig
>>> # Initializing a Granite granite-3b style configuration
>>> configuration = GraniteConfig()
>>> # Initializing a model from the granite-7b style configuration
>>> model = GraniteModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "granite"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
    self,
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    embedding_multiplier=1.0,
    logits_scaling=1.0,
    residual_multiplier=1.0,
    attention_multiplier=1.0,
    **kwargs,
):
    """Record the Granite hyper-parameters on this configuration object.

    Every keyword is stored under an attribute of the same name. The
    special-token ids, ``tie_word_embeddings`` and any extra ``kwargs``
    are forwarded to the parent initializer, after which the RoPE
    settings are checked with ``rope_config_validation``.
    """
    # Sizes and layer counts.
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # For backward compatibility: when no key/value head count is given,
    # fall back to the number of attention heads.
    self.num_key_value_heads = (num_attention_heads
                                if num_key_value_heads is None else
                                num_key_value_heads)

    # Activation, initialization, normalization and caching options.
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache

    # Rotary position embedding settings.
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling

    # Attention / MLP options.
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias

    # Granite-specific scaling factors.
    self.embedding_multiplier = embedding_multiplier
    self.logits_scaling = logits_scaling
    self.residual_multiplier = residual_multiplier
    self.attention_multiplier = attention_multiplier

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

    # Runs last, once every RoPE-related attribute has been set.
    rope_config_validation(self)
...@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens( ...@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens(
prefix_offset = max( prefix_offset = max(
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0) read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
# This is required to guard against out-of-vocab prompt token ids # This is required to guard against out-of-vocab prompt token ids
_replace_none_with_empty(new_tokens) _replace_none_with_empty(new_tokens) # type: ignore[arg-type]
return new_tokens, prefix_offset, read_offset return new_tokens, prefix_offset, read_offset
......
import os import os
import warnings
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
...@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, ...@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
from vllm.envs import VLLM_USE_MODELSCOPE from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizers import BaichuanTokenizer from vllm.transformers_utils.tokenizers import (BaichuanTokenizer,
MistralTokenizer)
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async from vllm.utils import make_async
logger = init_logger(__name__) logger = init_logger(__name__)
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
MistralTokenizer]
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
...@@ -93,51 +97,69 @@ def get_tokenizer( ...@@ -93,51 +97,69 @@ def get_tokenizer(
kwargs["truncation_side"] = "left" kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
is_gguf = Path(tokenizer_name).is_file() and Path( is_gguf = check_gguf_file(tokenizer_name)
tokenizer_name).suffix == ".gguf"
if is_gguf: if is_gguf:
kwargs["gguf_file"] = Path(tokenizer_name).name kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent tokenizer_name = Path(tokenizer_name).parent
try: # if tokenizer is from official mistral org
tokenizer = AutoTokenizer.from_pretrained( is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
tokenizer_name, if is_from_mistral_org and tokenizer_mode != "mistral":
*args, warnings.warn(
trust_remote_code=trust_remote_code, 'It is strongly recommended to run mistral models with '
revision=revision, '`--tokenizer_mode "mistral"` to ensure correct '
**kwargs) 'encoding and decoding.',
except ValueError as e: FutureWarning,
# If the error pertains to the tokenizer class not existing or not stacklevel=2)
# currently being imported, suggest using the --trust-remote-code flag.
if (not trust_remote_code and if tokenizer_mode == "mistral":
("does not exist or is not currently imported." in str(e) tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
or "requires you to execute the tokenizer file" in str(e))): revision=revision)
err_msg = ( else:
"Failed to load the tokenizer. If the tokenizer is a custom " try:
"tokenizer not yet available in the HuggingFace transformers " tokenizer = AutoTokenizer.from_pretrained(
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
except AttributeError as e:
if "BaichuanTokenizer" in str(e):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer = BaichuanTokenizer.from_pretrained(
tokenizer_name, tokenizer_name,
*args, *args,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
revision=revision, revision=revision,
**kwargs) **kwargs,
else: )
raise e except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
# currently being imported,
# suggest using the --trust-remote-code flag.
if not trust_remote_code and (
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)):
err_msg = ("Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
except AttributeError as e:
if "BaichuanTokenizer" in str(e):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer = BaichuanTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs,
)
else:
raise e
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
tokenizer = get_cached_tokenizer(tokenizer)
if not isinstance(tokenizer, PreTrainedTokenizerFast): return tokenizer
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
return get_cached_tokenizer(tokenizer)
def get_lora_tokenizer(lora_request: LoRARequest, *args, def get_lora_tokenizer(lora_request: LoRARequest, *args,
......
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
__all__ = [ __all__ = ["BaichuanTokenizer", "MistralTokenizer"]
"BaichuanTokenizer",
]
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from huggingface_hub import HfApi, hf_hub_download
# yapf: disable
from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
# yapf: enable
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer)
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer)
if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ConversationMessage
@dataclass
class Encoding:
    """Minimal container for the result of tokenizing a prompt.

    Instances are built by ``MistralTokenizer.__call__`` so that callers can
    read the ids through an ``input_ids`` attribute.
    NOTE(review): presumably mirrors the attribute name of HuggingFace
    tokenizer outputs -- confirm against callers.
    """
    # Token ids of the encoded prompt.
    input_ids: List[int]
def find_tokenizer_file(files: List[str]):
    """Return the name of the single Mistral tokenizer file in *files*.

    A file qualifies when its name is ``tekken.json`` or starts with
    ``tokenizer.model.v``.

    Args:
        files: Candidate file names (for example a directory listing or
            the file names of a Hub repository).

    Returns:
        The one matching file name.

    Raises:
        OSError: If no file, or more than one file, matches.
    """
    file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")

    matched_files = [file for file in files if file_pattern.match(file)]
    if len(matched_files) > 1:
        # Every fragment must be an f-string, otherwise the placeholders
        # are emitted literally in the error message.
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern `{file_pattern.pattern}`: {matched_files}. "
            f"Make sure only one Mistral tokenizer is present in {files}.")
    elif len(matched_files) == 0:
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern `{file_pattern.pattern}`. "
            f"Make sure that a Mistral tokenizer is present in {files}.")

    return matched_files[0]
class MistralTokenizer:
    """Wrapper around a ``mistral_common`` tokenizer.

    Exposes the methods and attributes of the wrapped tokenizer under the
    names this code base calls on its tokenizers (``encode``, ``decode``,
    ``apply_chat_template``, ``convert_ids_to_tokens`` ...).
    """

    def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
        """Wrap an already loaded ``mistral_common`` tokenizer."""
        self.mistral = tokenizer
        self.instruct = tokenizer.instruct_tokenizer
        self.tokenizer = tokenizer.instruct_tokenizer.tokenizer

        self.vocab_size = len(self.tokenizer.vocab())

        assert isinstance(self.tokenizer,
                          (Tekkenizer, SentencePieceTokenizer)), type(
                              self.tokenizer)

        if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
            # Make sure special tokens will not raise
            self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE

        self._is_tekken = is_tekken

        # the following attributes are set to fit VLLM's design
        self.is_fast = True
        self.chat_template = True
        self.all_special_ids: List[Any] = []
        self.all_special_tokens: List[Any] = []
        self.all_special_tokens_extended: List[Any] = []

    @classmethod
    def from_pretrained(cls,
                        path_or_repo_id: str,
                        *,
                        revision: Optional[str] = None) -> "MistralTokenizer":
        """Load a tokenizer from a local path or a HF Hub repository.

        Args:
            path_or_repo_id: A tokenizer file, a directory containing
                exactly one Mistral tokenizer file, or an ``org/name``
                Hub repository id.
            revision: Hub revision to use; only relevant when
                ``path_or_repo_id`` is a repository id.

        Returns:
            A new ``MistralTokenizer`` wrapping the loaded tokenizer.

        Raises:
            AssertionError: If the argument is neither an existing path
                nor shaped like ``org/name``.
            OSError: If no unique tokenizer file can be found.
        """
        path = Path(path_or_repo_id)

        if not path.exists():
            assert len(path_or_repo_id.split("/")) == 2, (
                "You have either provided a non-existent path: "
                f"{path_or_repo_id} or an invalid HF Hub repo id.")
            tokenizer_file = cls._download_mistral_tokenizer_from_hf(
                path_or_repo_id, revision)
        elif path.is_dir():
            tokenizer_file_name = find_tokenizer_file(
                os.listdir(path_or_repo_id))
            tokenizer_file = str(path / tokenizer_file_name)
        else:
            assert path.is_file(), f"Invalid path: {path_or_repo_id}"
            # The path already points at the tokenizer file itself; without
            # this assignment the call below fails with UnboundLocalError.
            tokenizer_file = str(path)

        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
        return cls(mistral_tokenizer)

    @staticmethod
    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                            revision: Optional[str]) -> str:
        """Download the tokenizer file of a Hub repo and return its path."""
        api = HfApi()
        # List the files of the same revision that is downloaded below, so
        # the chosen file name is guaranteed to exist at that revision.
        repo_info = api.model_info(tokenizer_name, revision=revision)
        files = [s.rfilename for s in repo_info.siblings]

        filename = find_tokenizer_file(files)

        tokenizer_file = hf_hub_download(tokenizer_name,
                                         filename=filename,
                                         revision=revision)
        return tokenizer_file

    def __call__(
        self,
        prompt: str,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        """Encode *prompt* and return the ids wrapped in an ``Encoding``.

        ``add_special_tokens`` is accepted but not used. When ``truncation``
        is set, the ids are cut to the first ``max_length`` entries (a
        ``max_length`` of ``None`` keeps everything).
        """
        # Mistral Tokenizers should not add special tokens
        input_ids = self.encode(prompt)

        if truncation:
            input_ids = input_ids[:max_length]

        return Encoding(input_ids=input_ids)

    def get_added_vocab(self) -> List[str]:
        """Return the added vocabulary, which is always empty."""
        # Mistral tokenizers have no added vocabulary
        return []

    def encode(self, prompt: str) -> List[int]:
        """Encode a raw prompt with a BOS token and without an EOS token."""
        # `encode` should only be used for prompt completion
        # it should never be used for chat_completion.
        # For chat completion use `apply_chat_template`
        return self.tokenizer.encode(prompt, bos=True, eos=False)

    def apply_chat_template(self,
                            conversation: List["ConversationMessage"],
                            tools: Optional[Dict[str, Any]] = None,
                            **kwargs) -> List[int]:
        """Encode a chat conversation and return its token ids.

        Unlike ``encode`` this returns the tokens produced by the wrapped
        tokenizer's chat-completion encoding. ``tools`` must be ``None``;
        extra keyword arguments are accepted and ignored.
        """
        assert tools is None, "`tools` are not yet supported."

        request = ChatCompletionRequest(
            messages=conversation)  # type: ignore[type-var]
        encoded = self.mistral.encode_chat_completion(request)

        return encoded.tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Turn a list of token pieces back into text."""
        if self._is_tekken:
            return "".join(tokens)
        else:
            return self.tokenizer.decode(tokens)  # type: ignore[arg-type]

    def decode(self, ids: Union[List[int], int]) -> str:
        """Decode a single id or a list of ids into text."""
        if isinstance(ids, int):
            ids = [ids]
        return self.tokenizer.decode(ids)

    @property
    def eos_token_id(self):
        """End-of-sequence id of the wrapped tokenizer."""
        return self.tokenizer.eos_id

    def convert_ids_to_tokens(
            self,
            ids: List[int],
            skip_special_tokens: Optional[bool] = True) -> List[str]:
        """Map each id in *ids* to its token piece.

        Raises:
            AssertionError: If ``skip_special_tokens`` is falsy; keeping
                special tokens is not implemented.
        """
        # TODO(Patrick) - potentially allow special tokens to not be skipped
        assert (
            skip_special_tokens
        ), "skip_special_tokens=False is not supported for Mistral tokenizers."

        assert isinstance(self.tokenizer,
                          (Tekkenizer, SentencePieceTokenizer)), type(
                              self.tokenizer)

        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids]
        return tokens

    def __len__(self):
        """Return the vocabulary size computed at construction time."""
        return self.vocab_size
from os import PathLike
from pathlib import Path
from typing import Union
def check_gguf_file(model: Union[str, PathLike]) -> bool:
    """Tell whether *model* points at a GGUF file.

    Anything that is not an existing regular file yields ``False``. A file
    with the ``.gguf`` suffix is accepted without being opened; any other
    file is accepted only if its first four bytes are ``b"GGUF"``.
    """
    candidate = Path(model)

    if not candidate.is_file():
        return False
    if candidate.suffix == ".gguf":
        return True

    # No telling suffix: fall back to the magic bytes at the file start.
    with candidate.open("rb") as handle:
        magic = handle.read(4)
    return magic == b"GGUF"
...@@ -25,6 +25,8 @@ import numpy.typing as npt ...@@ -25,6 +25,8 @@ import numpy.typing as npt
import psutil import psutil
import torch import torch
import torch.types import torch.types
import yaml
from packaging.version import Version
from typing_extensions import ParamSpec, TypeIs, assert_never from typing_extensions import ParamSpec, TypeIs, assert_never
import vllm.envs as envs import vllm.envs as envs
...@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser): ...@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
if args is None: if args is None:
args = sys.argv[1:] args = sys.argv[1:]
if '--config' in args:
args = FlexibleArgumentParser._pull_args_from_config(args)
# Convert underscores to dashes and vice versa in argument names # Convert underscores to dashes and vice versa in argument names
processed_args = [] processed_args = []
for arg in args: for arg in args:
...@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser): ...@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
return super().parse_args(processed_args, namespace) return super().parse_args(processed_args, namespace)
@staticmethod
def _pull_args_from_config(args: List[str]) -> List[str]:
    """Expand ``--config <file>`` into the arguments stored in that file.

    The ``--config`` option and its value are removed and the arguments
    loaded from the file are placed before every other option, so that
    options given explicitly on the command line are parsed later and
    therefore win (cli > config > defaults).

    Example, with a config file containing::

        port: 12323
        tensor-parallel-size: 4

    the argument list::

        ["serve", "facebook/opt-12B", "--config", "config.yaml", "-tp", "2"]

    becomes::

        ["serve", "--port", "12323", "--tensor-parallel-size", "4",
         "facebook/opt-12B", "-tp", "2"]

    When the list starts with an option instead of a sub-command (for
    example ``["--config", "config.yaml", "--port", "8000"]``) the config
    arguments are simply placed first.

    Args:
        args: Command-line arguments containing ``--config``.

    Returns:
        A new argument list with the config file contents spliced in.

    Raises:
        AssertionError: If ``--config`` is given more than once.
        ValueError: If ``--config`` is the last argument (no file given),
            or is absent from ``args``.
    """
    assert args.count(
        '--config') <= 1, "More than one config file specified!"

    index = args.index('--config')
    if index == len(args) - 1:
        raise ValueError("No config file specified! "
                         "Please check your command-line arguments.")

    file_path = args[index + 1]

    config_args = FlexibleArgumentParser._load_config_file(file_path)

    # Everything except the `--config <file>` pair, in original order.
    remaining = args[:index] + args[index + 2:]

    if args[0].startswith('-'):
        # No leading sub-command: slicing around args[0] would either keep
        # a dangling `--config` or separate an option from its value.
        return config_args + remaining

    # 0th index is for {serve,chat,complete}
    # followed by config args
    # followed by rest of cli args.
    # maintaining this order will enforce the precedence
    # of cli > config > defaults
    return [remaining[0]] + config_args + remaining[1:]
@staticmethod
def _load_config_file(file_path: str) -> List[str]:
    """Load a YAML file and flatten it into argparse-style arguments.

    A file containing::

        port: 12323
        tensor-parallel-size: 4

    is returned as::

        ['--port', '12323', '--tensor-parallel-size', '4']

    Args:
        file_path: Path of the config file; must end in ``yaml`` or
            ``yml``.

    Returns:
        A flat list alternating ``--<key>`` and ``str(value)``; empty when
        the file holds no document.

    Raises:
        ValueError: If the extension is not yaml/yml, or the file content
            is not a mapping.
        Exception: Whatever opening or parsing the file raises, re-raised
            after being logged.
    """
    extension: str = file_path.split('.')[-1]
    if extension not in ('yaml', 'yml'):
        # Exceptions do not interpolate %-style arguments the way logging
        # calls do, so the message is formatted here.
        raise ValueError("Config file must be of a yaml/yml type. "
                         f"{extension} supplied")

    # only expecting a flat dictionary of atomic types
    processed_args: List[str] = []

    config: Dict[str, Union[int, str]] = {}
    try:
        with open(file_path, 'r') as config_file:
            # safe_load returns None for an empty document.
            config = yaml.safe_load(config_file) or {}
    except Exception:
        logger.error(
            "Unable to read the config file at %s. "
            "Make sure path is correct", file_path)
        raise

    if not isinstance(config, dict):
        raise ValueError(
            "Config file must contain a flat mapping of argument names "
            f"to values, got {type(config).__name__}")

    for key, value in config.items():
        processed_args.append('--' + key)
        processed_args.append(str(value))

    return processed_args
async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
**kwargs): **kwargs):
"""Utility function to run async task in a lock""" """Utility function to run async task in a lock"""
async with lock: async with lock:
return await task(*args, **kwargs) return await task(*args, **kwargs)
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
def supports_dynamo() -> bool:
    """Return True when the installed PyTorch is at least version 2.4.0.

    Only the base version of ``torch.__version__`` is compared.
    """
    minimum_version = Version("2.4.0")
    installed_version = Version(torch.__version__)
    return Version(installed_version.base_version) >= minimum_version
...@@ -9,4 +9,4 @@ except Exception as e: ...@@ -9,4 +9,4 @@ except Exception as e:
stacklevel=2) stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER" __commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.5" __version__ = "0.6.0"
...@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ...@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig) SchedulerConfig)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalInputs) MultiModalInputs)
from vllm.sequence import (IntermediateTensors, SamplerOutput, from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
SequenceGroupMetadata)
from vllm.utils import make_tensor_with_pad from vllm.utils import make_tensor_with_pad
from vllm.worker.model_runner_base import ( from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerBase, ModelRunnerInputBase,
......
...@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ...@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.inputs import INPUT_REGISTRY, InputRegistry
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import (IntermediateTensors, PoolerOutput, SamplerOutput, from vllm.sequence import (IntermediateTensors, PoolerOutput,
SequenceGroupMetadata) SequenceGroupMetadata)
from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
from vllm.worker.model_runner import (GPUModelRunnerBase, from vllm.worker.model_runner import (GPUModelRunnerBase,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment