Unverified commit 2f8844ba, authored by Zhuohan Li and committed by GitHub
Browse files

Re-enable the 80 char line width limit (#3305)

parent 4b59f00e
...@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker ...@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker
from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.config import CacheConfig from vllm.config import CacheConfig
from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids,
split_batch_by_proposal_len)
from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.interfaces import SpeculativeScorer from vllm.spec_decode.interfaces import SpeculativeScorer
...@@ -109,10 +110,12 @@ class SpecDecodeWorker: ...@@ -109,10 +110,12 @@ class SpecDecodeWorker:
block_size, gpu_memory_utilization, cpu_swap_space, block_size, gpu_memory_utilization, cpu_swap_space,
cache_dtype)) cache_dtype))
scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( scorer_cache_block_size_bytes = (
block_size, cache_dtype) self.scorer_worker.get_cache_block_size_bytes(
proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( block_size, cache_dtype))
block_size, cache_dtype) proposer_cache_block_size_bytes = (
self.proposer_worker.get_cache_block_size_bytes(
block_size, cache_dtype))
new_num_gpu_blocks = split_num_cache_blocks_evenly( new_num_gpu_blocks = split_num_cache_blocks_evenly(
scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
...@@ -320,8 +323,8 @@ class SpecDecodeWorker: ...@@ -320,8 +323,8 @@ class SpecDecodeWorker:
sampler_output_list.append( sampler_output_list.append(
SamplerOutput(outputs=step_output_token_ids)) SamplerOutput(outputs=step_output_token_ids))
maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( maybe_rejsample_metrics = (
k) self._metrics.maybe_collect_rejsample_metrics(k))
if maybe_rejsample_metrics is not None: if maybe_rejsample_metrics is not None:
sampler_output_list[ sampler_output_list[
0].spec_decode_worker_metrics = maybe_rejsample_metrics 0].spec_decode_worker_metrics = maybe_rejsample_metrics
......
...@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig): ...@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig):
fc_type: str = 'torch', fc_type: str = 'torch',
verbose: Optional[int] = None, verbose: Optional[int] = None,
**kwargs: Any): **kwargs: Any):
"""The MPT configuration class.
Args:
d_model (int): The size of the embedding dimension of the model.
n_heads (int): The number of attention heads.
n_layers (int): The number of layers in the model.
expansion_ratio (int): The ratio of the up/down scale in the ffn.
max_seq_len (int): The maximum sequence length of the model.
vocab_size (int): The size of the vocabulary.
resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
emb_pdrop (float): The dropout probability for the embedding layer.
learned_pos_emb (bool): Whether to use learned positional embeddings
attn_config (Dict): A dictionary used to configure the model's attention module:
attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
attn_pdrop (float): The dropout probability for the attention layers.
attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
this value.
softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
use the default scale of ``1/sqrt(d_keys)``.
prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
which sub-sequence each token belongs to.
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
alibi (bool): Whether to use the alibi bias instead of position embeddings.
alibi_bias_max (int): The maximum value of the alibi bias.
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
ffn_config (Dict): A dictionary used to configure the model's ffn module:
ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
init_device (str): The device to use for parameter initialization.
logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
no_bias (bool): Whether to use bias in all layers.
verbose (int): The verbosity level. 0 is silent.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
init_std (float): The standard deviation of the normal distribution used to initialize the model,
if using the baseline_ parameter initialization scheme.
init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
---
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
"""
self.d_model = d_model self.d_model = d_model
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
...@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig): ...@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig):
self.fc_type = fc_type self.fc_type = fc_type
if verbose is not None: if verbose is not None:
warnings.warn(DeprecationWarning( warnings.warn(DeprecationWarning(
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' 'verbose argument for MPTConfig is now ignored and '
), 'will be removed. Use python_log_level instead.'),
stacklevel=2) stacklevel=2)
if 'name' in kwargs: if 'name' in kwargs:
del kwargs['name'] del kwargs['name']
...@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig): ...@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig):
if self.attn_config.get('alibi', False): if self.attn_config.get('alibi', False):
self.learned_pos_emb = False self.learned_pos_emb = False
warnings.warn( warnings.warn(
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', f'alibi is turned on, setting `learned_pos_emb` '
f'to {self.learned_pos_emb}`',
stacklevel=2) stacklevel=2)
super().__init__(**kwargs) super().__init__(**kwargs)
self._validate_config() self._validate_config()
...@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig): ...@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig):
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
)): )):
raise ValueError( raise ValueError(
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
) "probabilities and must be between 0 and 1")
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
raise ValueError( raise ValueError(
f"Unknown attn_impl={self.attn_config['attn_impl']}") f"Unknown attn_impl={self.attn_config['attn_impl']}")
...@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig): ...@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig):
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
'attn_impl'] not in ['torch', 'triton']: 'attn_impl'] not in ['torch', 'triton']:
raise NotImplementedError( raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long 'attn_uses_sequence_id only implemented with torch '
) 'and triton attention.')
if self.embedding_fraction > 1 or self.embedding_fraction <= 0: if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
raise ValueError( raise ValueError(
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long 'model.embedding_fraction must be between 0 (exclusive) '
) 'and 1 (inclusive)!')
if isinstance(self.logit_scale, if isinstance(self.logit_scale,
str) and self.logit_scale != 'inv_sqrt_d_model': str) and self.logit_scale != 'inv_sqrt_d_model':
raise ValueError( raise ValueError(
f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long f"self.logit_scale={self.logit_scale!r} is not recognized as "
) "an option; use numeric value or 'inv_sqrt_d_model'.")
if self.init_config.get('name', None) is None: if self.init_config.get('name', None) is None:
raise ValueError( raise ValueError(
f"self.init_config={self.init_config!r} 'name' needs to be set." f"self.init_config={self.init_config!r} 'name' needs to be set."
...@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig): ...@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig):
del te del te
except Exception as exc: except Exception as exc:
raise ImportError( raise ImportError(
# pylint: disable=line-too-long 'TransformerEngine import fail. `fc_type: te` requires '
'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' 'TransformerEngine be installed. '
+ 'The required version of transformer_engine also requires '
'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' 'FlashAttention v1.0.6 is installed:\n'
+ 'pip install flash-attn==1.0.6 --no-build-isolation \n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n'
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
) from exc ) from exc
if self.ffn_config['ffn_type'] == 'mptmlp': if self.ffn_config['ffn_type'] == 'mptmlp':
......
...@@ -2,78 +2,6 @@ from transformers import PretrainedConfig ...@@ -2,78 +2,6 @@ from transformers import PretrainedConfig
class Starcoder2Config(PretrainedConfig): class Starcoder2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 49152):
Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Starcoder2Model`]
hidden_size (`int`, *optional*, defaults to 3072):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 12288):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 30):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 2):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
allows sequence of up to 4096*32 tokens.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
norm_epsilon (`float`, *optional*, defaults to 1e-05):
Epsilon value for the layer norm
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
bos_token_id (`int`, *optional*, defaults to 50256):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 50256):
The id of the "end-of-sequence" token.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `None` (no sliding window).
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
residual_dropout (`float`, *optional*, defaults to 0.0):
Residual connection dropout value.
embedding_dropout (`float`, *optional*, defaults to 0.0):
Embedding dropout.
use_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias term on linear layers of the model.
```python
>>> from transformers import Starcoder2Model, Starcoder2Config
>>> # Initializing a Starcoder2 7B style configuration
>>> configuration = Starcoder2Config()
>>> # Initializing a model from the Starcoder2 7B style configuration
>>> model = Starcoder2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "starcoder2" model_type = "starcoder2"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
......
# yapf: disable
# Adapted from # Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in # This includes a fix suggested in
...@@ -13,7 +12,6 @@ import sentencepiece as spm ...@@ -13,7 +12,6 @@ import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging from transformers.utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
...@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces=False, clean_up_tokenization_spaces=False,
**kwargs, **kwargs,
): ):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.sp_model_kwargs = ({} if sp_model_kwargs is None else
bos_token = ( sp_model_kwargs)
AddedToken(bos_token, lstrip=False, rstrip=False) bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str) if isinstance(bos_token, str) else bos_token)
else bos_token eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
) if isinstance(eos_token, str) else eos_token)
eos_token = ( unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token)
if isinstance(eos_token, str) pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
else eos_token if isinstance(pad_token, str) else pad_token)
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token self.add_eos_token = add_eos_token
...@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
def get_vocab(self): def get_vocab(self):
"""Returns vocab as a dict""" """Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
...@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string = "" out_string = ""
prev_is_special = False prev_is_special = False
for i, token in enumerate(tokens): for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model # make sure that special tokens are not decoded using
# sentencepiece model
if token in self.all_special_tokens: if token in self.all_special_tokens:
if not prev_is_special and i != 0: if not prev_is_special and i != 0:
out_string += " " out_string += " "
...@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string += self.sp_model.decode(current_sub_tokens) out_string += self.sp_model.decode(current_sub_tokens)
return out_string return out_string
def save_vocabulary( def save_vocabulary(self,
self, save_directory, filename_prefix: Optional[str] = None save_directory,
) -> Tuple[str]: filename_prefix: Optional[str] = None) -> Tuple[str]:
""" """
Save the vocabulary and special tokens file to a directory. Save the vocabulary and special tokens file to a directory.
...@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved. `Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory") logger.error(f"Vocabulary path ({save_directory}) "
"should be a directory")
return return
out_vocab_file = os.path.join( out_vocab_file = os.path.join(
save_directory, save_directory,
(filename_prefix + "-" if filename_prefix else "") (filename_prefix + "-" if filename_prefix else "") +
+ VOCAB_FILES_NAMES["vocab_file"], VOCAB_FILES_NAMES["vocab_file"],
) )
if os.path.abspath(self.vocab_file) != os.path.abspath( if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file out_vocab_file) and os.path.isfile(self.vocab_file):
) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file): elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi: with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto() content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model) fi.write(content_spiece_model)
return (out_vocab_file,) return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else [] bos_token_id = [self.bos_token_id] if self.add_bos_token else []
...@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
already_has_special_tokens: bool = False, already_has_special_tokens: bool = False,
) -> List[int]: ) -> List[int]:
""" """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding Retrieve sequence ids from a token list that has no special tokens
added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method. special tokens using the tokenizer `prepare_for_model` method.
Args: Args:
...@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
List of IDs. List of IDs.
token_ids_1 (`List[int]`, *optional*): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`): already_has_special_tokens (`bool`, *optional*, defaults to
Whether or not the token list is already formatted with special tokens for the model. `False`):
Whether or not the token list is already formatted with
special tokens for the model.
Returns: Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. `List[int]`: A list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
return super().get_special_tokens_mask( return super().get_special_tokens_mask(
...@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
if token_ids_1 is None: if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return ( return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
def create_token_type_ids_from_sequences( def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self,
) -> List[int]: token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT Creates a mask from the two sequences passed to be used in a
sequence-pair classification task. An ALBERT
sequence pair mask has the following format: sequence pair mask has the following format:
``` ```
...@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). `List[int]`: List of [token type IDs](../glossary#token-type-ids)
according to the given sequence(s).
""" """
bos_token_id = [self.bos_token_id] if self.add_bos_token else [] bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else []
......
...@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: ...@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
# the Neuron-X backend does not have the `cuda_utils` module. # the Neuron-X backend does not have the `cuda_utils` module.
from vllm._C import cuda_utils from vllm._C import cuda_utils
max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( max_shared_mem = (
gpu) cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
# will fail
assert max_shared_mem > 0, "max_shared_mem can not be zero" assert max_shared_mem > 0, "max_shared_mem can not be zero"
return int(max_shared_mem) return int(max_shared_mem)
...@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: ...@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if not cuda_home: if not cuda_home:
cuda_home = '/usr/local/cuda' cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'): if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info( logger.info(f'CUDA_HOME is not found in the environment. '
f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' f'Using {cuda_home} as CUDA_HOME.')
)
else: else:
logger.warning( logger.warning(
f'Not found nvcc in {cuda_home}. Skip cuda version check!') f'Not found nvcc in {cuda_home}. Skip cuda version check!')
......
...@@ -93,14 +93,13 @@ class ModelRunner: ...@@ -93,14 +93,13 @@ class ModelRunner:
scheduler_config=self.scheduler_config) scheduler_config=self.scheduler_config)
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
logger.info( logger.info(f"Loading model weights took "
f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" f"{self.model_memory_usage / float(2**30):.4f} GB")
)
if self.lora_config: if self.lora_config:
assert hasattr( assert hasattr(self.model, "supported_lora_modules"
self.model, "supported_lora_modules" ) and self.model.supported_lora_modules, (
) and self.model.supported_lora_modules, "Model does not support LoRA" "Model does not support LoRA")
assert hasattr( assert hasattr(
self.model, self.model,
"embedding_modules"), "Model does not have embedding_modules" "embedding_modules"), "Model does not have embedding_modules"
......
...@@ -79,7 +79,8 @@ class Worker: ...@@ -79,7 +79,8 @@ class Worker:
cpu_swap_space: int = 0, cpu_swap_space: int = 0,
cache_dtype: str = "float16", cache_dtype: str = "float16",
) -> Tuple[int, int]: ) -> Tuple[int, int]:
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" """Simply returns max_num_seqs as num_gpu_blocks, 0 as
num_cpu_blocks."""
num_gpu_blocks = self.scheduler_config.max_num_seqs num_gpu_blocks = self.scheduler_config.max_num_seqs
num_cpu_blocks = 0 num_cpu_blocks = 0
return num_gpu_blocks, num_cpu_blocks return num_gpu_blocks, num_cpu_blocks
...@@ -177,7 +178,8 @@ def _init_distributed_environment( ...@@ -177,7 +178,8 @@ def _init_distributed_environment(
"distributed_init_method must be set if torch.distributed " "distributed_init_method must be set if torch.distributed "
"is not already initialized") "is not already initialized")
else: else:
distributed_backend = distributed_backend if distributed_backend else "nccl" distributed_backend = (distributed_backend
if distributed_backend else "nccl")
torch.distributed.init_process_group( torch.distributed.init_process_group(
backend=distributed_backend, backend=distributed_backend,
world_size=parallel_config.world_size, world_size=parallel_config.world_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment