Unverified Commit 2f8844ba authored by Zhuohan Li's avatar Zhuohan Li Committed by GitHub
Browse files

Re-enable the 80 char line width limit (#3305)

parent 4b59f00e
...@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker ...@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker
from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.config import CacheConfig from vllm.config import CacheConfig
from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids,
split_batch_by_proposal_len)
from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.interfaces import SpeculativeScorer from vllm.spec_decode.interfaces import SpeculativeScorer
...@@ -25,7 +26,7 @@ class SpecDecodeWorker: ...@@ -25,7 +26,7 @@ class SpecDecodeWorker:
LLM, after which some verification routine determines which (if any) of the LLM, after which some verification routine determines which (if any) of the
speculative tokens are accepted by the larger LLM. speculative tokens are accepted by the larger LLM.
See https://github.com/vllm-project/vllm/pull/2188 and See https://github.com/vllm-project/vllm/pull/2188 and
https://github.com/vllm-project/vllm/pull/3103 for more info. https://github.com/vllm-project/vllm/pull/3103 for more info.
The current implementation has the following limitations: The current implementation has the following limitations:
...@@ -109,10 +110,12 @@ class SpecDecodeWorker: ...@@ -109,10 +110,12 @@ class SpecDecodeWorker:
block_size, gpu_memory_utilization, cpu_swap_space, block_size, gpu_memory_utilization, cpu_swap_space,
cache_dtype)) cache_dtype))
scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( scorer_cache_block_size_bytes = (
block_size, cache_dtype) self.scorer_worker.get_cache_block_size_bytes(
proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( block_size, cache_dtype))
block_size, cache_dtype) proposer_cache_block_size_bytes = (
self.proposer_worker.get_cache_block_size_bytes(
block_size, cache_dtype))
new_num_gpu_blocks = split_num_cache_blocks_evenly( new_num_gpu_blocks = split_num_cache_blocks_evenly(
scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
...@@ -320,8 +323,8 @@ class SpecDecodeWorker: ...@@ -320,8 +323,8 @@ class SpecDecodeWorker:
sampler_output_list.append( sampler_output_list.append(
SamplerOutput(outputs=step_output_token_ids)) SamplerOutput(outputs=step_output_token_ids))
maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( maybe_rejsample_metrics = (
k) self._metrics.maybe_collect_rejsample_metrics(k))
if maybe_rejsample_metrics is not None: if maybe_rejsample_metrics is not None:
sampler_output_list[ sampler_output_list[
0].spec_decode_worker_metrics = maybe_rejsample_metrics 0].spec_decode_worker_metrics = maybe_rejsample_metrics
......
...@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig): ...@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig):
fc_type: str = 'torch', fc_type: str = 'torch',
verbose: Optional[int] = None, verbose: Optional[int] = None,
**kwargs: Any): **kwargs: Any):
"""The MPT configuration class.
Args:
d_model (int): The size of the embedding dimension of the model.
n_heads (int): The number of attention heads.
n_layers (int): The number of layers in the model.
expansion_ratio (int): The ratio of the up/down scale in the ffn.
max_seq_len (int): The maximum sequence length of the model.
vocab_size (int): The size of the vocabulary.
resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
emb_pdrop (float): The dropout probability for the embedding layer.
learned_pos_emb (bool): Whether to use learned positional embeddings
attn_config (Dict): A dictionary used to configure the model's attention module:
attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
attn_pdrop (float): The dropout probability for the attention layers.
attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
this value.
softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
use the default scale of ``1/sqrt(d_keys)``.
prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
which sub-sequence each token belongs to.
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
alibi (bool): Whether to use the alibi bias instead of position embeddings.
alibi_bias_max (int): The maximum value of the alibi bias.
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
ffn_config (Dict): A dictionary used to configure the model's ffn module:
ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
init_device (str): The device to use for parameter initialization.
logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
no_bias (bool): Whether to use bias in all layers.
verbose (int): The verbosity level. 0 is silent.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
init_std (float): The standard deviation of the normal distribution used to initialize the model,
if using the baseline_ parameter initialization scheme.
init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
---
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
"""
self.d_model = d_model self.d_model = d_model
self.n_heads = n_heads self.n_heads = n_heads
self.n_layers = n_layers self.n_layers = n_layers
...@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig): ...@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig):
self.fc_type = fc_type self.fc_type = fc_type
if verbose is not None: if verbose is not None:
warnings.warn(DeprecationWarning( warnings.warn(DeprecationWarning(
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' 'verbose argument for MPTConfig is now ignored and '
), 'will be removed. Use python_log_level instead.'),
stacklevel=2) stacklevel=2)
if 'name' in kwargs: if 'name' in kwargs:
del kwargs['name'] del kwargs['name']
...@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig): ...@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig):
if self.attn_config.get('alibi', False): if self.attn_config.get('alibi', False):
self.learned_pos_emb = False self.learned_pos_emb = False
warnings.warn( warnings.warn(
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', f'alibi is turned on, setting `learned_pos_emb` '
f'to {self.learned_pos_emb}`',
stacklevel=2) stacklevel=2)
super().__init__(**kwargs) super().__init__(**kwargs)
self._validate_config() self._validate_config()
...@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig): ...@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig):
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
)): )):
raise ValueError( raise ValueError(
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
) "probabilities and must be between 0 and 1")
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
raise ValueError( raise ValueError(
f"Unknown attn_impl={self.attn_config['attn_impl']}") f"Unknown attn_impl={self.attn_config['attn_impl']}")
...@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig): ...@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig):
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
'attn_impl'] not in ['torch', 'triton']: 'attn_impl'] not in ['torch', 'triton']:
raise NotImplementedError( raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long 'attn_uses_sequence_id only implemented with torch '
) 'and triton attention.')
if self.embedding_fraction > 1 or self.embedding_fraction <= 0: if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
raise ValueError( raise ValueError(
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long 'model.embedding_fraction must be between 0 (exclusive) '
) 'and 1 (inclusive)!')
if isinstance(self.logit_scale, if isinstance(self.logit_scale,
str) and self.logit_scale != 'inv_sqrt_d_model': str) and self.logit_scale != 'inv_sqrt_d_model':
raise ValueError( raise ValueError(
f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long f"self.logit_scale={self.logit_scale!r} is not recognized as "
) "an option; use numeric value or 'inv_sqrt_d_model'.")
if self.init_config.get('name', None) is None: if self.init_config.get('name', None) is None:
raise ValueError( raise ValueError(
f"self.init_config={self.init_config!r} 'name' needs to be set." f"self.init_config={self.init_config!r} 'name' needs to be set."
...@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig): ...@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig):
del te del te
except Exception as exc: except Exception as exc:
raise ImportError( raise ImportError(
# pylint: disable=line-too-long 'TransformerEngine import fail. `fc_type: te` requires '
'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' 'TransformerEngine be installed. '
+ 'The required version of transformer_engine also requires '
'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' 'FlashAttention v1.0.6 is installed:\n'
+ 'pip install flash-attn==1.0.6 --no-build-isolation \n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n'
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
) from exc ) from exc
if self.ffn_config['ffn_type'] == 'mptmlp': if self.ffn_config['ffn_type'] == 'mptmlp':
......
...@@ -2,78 +2,6 @@ from transformers import PretrainedConfig ...@@ -2,78 +2,6 @@ from transformers import PretrainedConfig
class Starcoder2Config(PretrainedConfig): class Starcoder2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 49152):
Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Starcoder2Model`].
hidden_size (`int`, *optional*, defaults to 3072):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 12288):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 30):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 2):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
allows sequences of up to 4096*32 tokens.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
norm_epsilon (`float`, *optional*, defaults to 1e-05):
Epsilon value for the layer norm
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
bos_token_id (`int`, *optional*, defaults to 50256):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 50256):
The id of the "end-of-sequence" token.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `None` (no sliding window).
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
residual_dropout (`float`, *optional*, defaults to 0.0):
Residual connection dropout value.
embedding_dropout (`float`, *optional*, defaults to 0.0):
Embedding dropout.
use_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias term on linear layers of the model.
```python
>>> from transformers import Starcoder2Model, Starcoder2Config
>>> # Initializing a Starcoder2 7B style configuration
>>> configuration = Starcoder2Config()
>>> # Initializing a model from the Starcoder2 7B style configuration
>>> model = Starcoder2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "starcoder2" model_type = "starcoder2"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
......
# yapf: disable
# Adapted from # Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in # This includes a fix suggested in
...@@ -13,7 +12,6 @@ import sentencepiece as spm ...@@ -13,7 +12,6 @@ import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging from transformers.utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
...@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces=False, clean_up_tokenization_spaces=False,
**kwargs, **kwargs,
): ):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.sp_model_kwargs = ({} if sp_model_kwargs is None else
bos_token = ( sp_model_kwargs)
AddedToken(bos_token, lstrip=False, rstrip=False) bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str) if isinstance(bos_token, str) else bos_token)
else bos_token eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
) if isinstance(eos_token, str) else eos_token)
eos_token = ( unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token)
if isinstance(eos_token, str) pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
else eos_token if isinstance(pad_token, str) else pad_token)
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token self.add_eos_token = add_eos_token
...@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
def get_vocab(self): def get_vocab(self):
"""Returns vocab as a dict""" """Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
...@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string = "" out_string = ""
prev_is_special = False prev_is_special = False
for i, token in enumerate(tokens): for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model # make sure that special tokens are not decoded using
# sentencepiece model
if token in self.all_special_tokens: if token in self.all_special_tokens:
if not prev_is_special and i != 0: if not prev_is_special and i != 0:
out_string += " " out_string += " "
...@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string += self.sp_model.decode(current_sub_tokens) out_string += self.sp_model.decode(current_sub_tokens)
return out_string return out_string
def save_vocabulary( def save_vocabulary(self,
self, save_directory, filename_prefix: Optional[str] = None save_directory,
) -> Tuple[str]: filename_prefix: Optional[str] = None) -> Tuple[str]:
""" """
Save the vocabulary and special tokens file to a directory. Save the vocabulary and special tokens file to a directory.
...@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved. `Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory") logger.error(f"Vocabulary path ({save_directory}) "
"should be a directory")
return return
out_vocab_file = os.path.join( out_vocab_file = os.path.join(
save_directory, save_directory,
(filename_prefix + "-" if filename_prefix else "") (filename_prefix + "-" if filename_prefix else "") +
+ VOCAB_FILES_NAMES["vocab_file"], VOCAB_FILES_NAMES["vocab_file"],
) )
if os.path.abspath(self.vocab_file) != os.path.abspath( if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file out_vocab_file) and os.path.isfile(self.vocab_file):
) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file): elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi: with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto() content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model) fi.write(content_spiece_model)
return (out_vocab_file,) return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else [] bos_token_id = [self.bos_token_id] if self.add_bos_token else []
...@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
already_has_special_tokens: bool = False, already_has_special_tokens: bool = False,
) -> List[int]: ) -> List[int]:
""" """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding Retrieve sequence ids from a token list that has no special tokens
added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method. special tokens using the tokenizer `prepare_for_model` method.
Args: Args:
...@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
List of IDs. List of IDs.
token_ids_1 (`List[int]`, *optional*): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`): already_has_special_tokens (`bool`, *optional*, defaults to
Whether or not the token list is already formatted with special tokens for the model. `False`):
Whether or not the token list is already formatted with
special tokens for the model.
Returns: Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. `List[int]`: A list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
return super().get_special_tokens_mask( return super().get_special_tokens_mask(
...@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
if token_ids_1 is None: if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return ( return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
def create_token_type_ids_from_sequences( def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self,
) -> List[int]: token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT Creates a mask from the two sequences passed to be used in a
sequence-pair classification task. An ALBERT
sequence pair mask has the following format: sequence pair mask has the following format:
``` ```
...@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer): ...@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). `List[int]`: List of [token type IDs](../glossary#token-type-ids)
according to the given sequence(s).
""" """
bos_token_id = [self.bos_token_id] if self.add_bos_token else [] bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else []
......
...@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: ...@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
# the Neuron-X backend does not have the `cuda_utils` module. # the Neuron-X backend does not have the `cuda_utils` module.
from vllm._C import cuda_utils from vllm._C import cuda_utils
max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( max_shared_mem = (
gpu) cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
# will fail
assert max_shared_mem > 0, "max_shared_mem can not be zero" assert max_shared_mem > 0, "max_shared_mem can not be zero"
return int(max_shared_mem) return int(max_shared_mem)
...@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: ...@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if not cuda_home: if not cuda_home:
cuda_home = '/usr/local/cuda' cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'): if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info( logger.info(f'CUDA_HOME is not found in the environment. '
f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' f'Using {cuda_home} as CUDA_HOME.')
)
else: else:
logger.warning( logger.warning(
f'Not found nvcc in {cuda_home}. Skip cuda version check!') f'Not found nvcc in {cuda_home}. Skip cuda version check!')
......
...@@ -93,14 +93,13 @@ class ModelRunner: ...@@ -93,14 +93,13 @@ class ModelRunner:
scheduler_config=self.scheduler_config) scheduler_config=self.scheduler_config)
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
logger.info( logger.info(f"Loading model weights took "
f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" f"{self.model_memory_usage / float(2**30):.4f} GB")
)
if self.lora_config: if self.lora_config:
assert hasattr( assert hasattr(self.model, "supported_lora_modules"
self.model, "supported_lora_modules" ) and self.model.supported_lora_modules, (
) and self.model.supported_lora_modules, "Model does not support LoRA" "Model does not support LoRA")
assert hasattr( assert hasattr(
self.model, self.model,
"embedding_modules"), "Model does not have embedding_modules" "embedding_modules"), "Model does not have embedding_modules"
......
...@@ -79,7 +79,8 @@ class Worker: ...@@ -79,7 +79,8 @@ class Worker:
cpu_swap_space: int = 0, cpu_swap_space: int = 0,
cache_dtype: str = "float16", cache_dtype: str = "float16",
) -> Tuple[int, int]: ) -> Tuple[int, int]:
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" """Simply returns max_num_seqs as num_gpu_blocks, 0 as
num_cpu_blocks."""
num_gpu_blocks = self.scheduler_config.max_num_seqs num_gpu_blocks = self.scheduler_config.max_num_seqs
num_cpu_blocks = 0 num_cpu_blocks = 0
return num_gpu_blocks, num_cpu_blocks return num_gpu_blocks, num_cpu_blocks
...@@ -177,7 +178,8 @@ def _init_distributed_environment( ...@@ -177,7 +178,8 @@ def _init_distributed_environment(
"distributed_init_method must be set if torch.distributed " "distributed_init_method must be set if torch.distributed "
"is not already initialized") "is not already initialized")
else: else:
distributed_backend = distributed_backend if distributed_backend else "nccl" distributed_backend = (distributed_backend
if distributed_backend else "nccl")
torch.distributed.init_process_group( torch.distributed.init_process_group(
backend=distributed_backend, backend=distributed_backend,
world_size=parallel_config.world_size, world_size=parallel_config.world_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment