Unverified Commit 2f8844ba authored by Zhuohan Li's avatar Zhuohan Li Committed by GitHub
Browse files

Re-enable the 80 char line width limit (#3305)

parent 4b59f00e
......@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.config import CacheConfig
from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len
from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids,
split_batch_by_proposal_len)
from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.interfaces import SpeculativeScorer
......@@ -25,7 +26,7 @@ class SpecDecodeWorker:
LLM, after which some verification routine determines which (if any) of the
speculative tokens are accepted by the larger LLM.
See https://github.com/vllm-project/vllm/pull/2188 and
See https://github.com/vllm-project/vllm/pull/2188 and
https://github.com/vllm-project/vllm/pull/3103 for more info.
The current implementation has the following limitations:
......@@ -109,10 +110,12 @@ class SpecDecodeWorker:
block_size, gpu_memory_utilization, cpu_swap_space,
cache_dtype))
scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes(
block_size, cache_dtype)
proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes(
block_size, cache_dtype)
scorer_cache_block_size_bytes = (
self.scorer_worker.get_cache_block_size_bytes(
block_size, cache_dtype))
proposer_cache_block_size_bytes = (
self.proposer_worker.get_cache_block_size_bytes(
block_size, cache_dtype))
new_num_gpu_blocks = split_num_cache_blocks_evenly(
scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
......@@ -320,8 +323,8 @@ class SpecDecodeWorker:
sampler_output_list.append(
SamplerOutput(outputs=step_output_token_ids))
maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics(
k)
maybe_rejsample_metrics = (
self._metrics.maybe_collect_rejsample_metrics(k))
if maybe_rejsample_metrics is not None:
sampler_output_list[
0].spec_decode_worker_metrics = maybe_rejsample_metrics
......
......@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig):
fc_type: str = 'torch',
verbose: Optional[int] = None,
**kwargs: Any):
"""The MPT configuration class.
Args:
d_model (int): The size of the embedding dimension of the model.
n_heads (int): The number of attention heads.
n_layers (int): The number of layers in the model.
expansion_ratio (int): The ratio of the up/down scale in the ffn.
max_seq_len (int): The maximum sequence length of the model.
vocab_size (int): The size of the vocabulary.
resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
emb_pdrop (float): The dropout probability for the embedding layer.
learned_pos_emb (bool): Whether to use learned positional embeddings
attn_config (Dict): A dictionary used to configure the model's attention module:
attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
attn_pdrop (float): The dropout probability for the attention layers.
attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
this value.
softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
use the default scale of ``1/sqrt(d_keys)``.
prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
which sub-sequence each token belongs to.
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
alibi (bool): Whether to use the alibi bias instead of position embeddings.
alibi_bias_max (int): The maximum value of the alibi bias.
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
ffn_config (Dict): A dictionary used to configure the model's ffn module:
ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
init_device (str): The device to use for parameter initialization.
logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
no_bias (bool): Whether to use bias in all layers.
verbose (int): The verbosity level. 0 is silent.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
init_std (float): The standard deviation of the normal distribution used to initialize the model,
if using the baseline_ parameter initialization scheme.
init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
---
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
"""
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
......@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig):
self.fc_type = fc_type
if verbose is not None:
warnings.warn(DeprecationWarning(
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
),
'verbose argument for MPTConfig is now ignored and '
'will be removed. Use python_log_level instead.'),
stacklevel=2)
if 'name' in kwargs:
del kwargs['name']
......@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig):
if self.attn_config.get('alibi', False):
self.learned_pos_emb = False
warnings.warn(
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`',
f'alibi is turned on, setting `learned_pos_emb` '
f'to {self.learned_pos_emb}`',
stacklevel=2)
super().__init__(**kwargs)
self._validate_config()
......@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig):
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
)):
raise ValueError(
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long
)
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
"probabilities and must be between 0 and 1")
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
raise ValueError(
f"Unknown attn_impl={self.attn_config['attn_impl']}")
......@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig):
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
'attn_impl'] not in ['torch', 'triton']:
raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long
)
'attn_uses_sequence_id only implemented with torch '
'and triton attention.')
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
raise ValueError(
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long
)
'model.embedding_fraction must be between 0 (exclusive) '
'and 1 (inclusive)!')
if isinstance(self.logit_scale,
str) and self.logit_scale != 'inv_sqrt_d_model':
raise ValueError(
f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long
)
f"self.logit_scale={self.logit_scale!r} is not recognized as "
"an option; use numeric value or 'inv_sqrt_d_model'.")
if self.init_config.get('name', None) is None:
raise ValueError(
f"self.init_config={self.init_config!r} 'name' needs to be set."
......@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig):
del te
except Exception as exc:
raise ImportError(
# pylint: disable=line-too-long
'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. '
+
'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n'
+ 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
'TransformerEngine import fail. `fc_type: te` requires '
'TransformerEngine be installed. '
'The required version of transformer_engine also requires '
'FlashAttention v1.0.6 is installed:\n'
'pip install flash-attn==1.0.6 --no-build-isolation \n'
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
) from exc
if self.ffn_config['ffn_type'] == 'mptmlp':
......
......@@ -2,78 +2,6 @@ from transformers import PretrainedConfig
class Starcoder2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 49152):
Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Starcoder2Model`]
hidden_size (`int`, *optional*, defaults to 3072):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 12288):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 30):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 2):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
allows sequence of up to 4096*32 tokens.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
norm_epsilon (`float`, *optional*, defaults to 1e-05):
Epsilon value for the layer norm
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
bos_token_id (`int`, *optional*, defaults to 50256):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 50256):
The id of the "end-of-sequence" token.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `None` (no sliding window).
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
residual_dropout (`float`, *optional*, defaults to 0.0):
Residual connection dropout value.
embedding_dropout (`float`, *optional*, defaults to 0.0):
Embedding dropout.
use_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias term on linear layers of the model.
```python
>>> from transformers import Starcoder2Model, Starcoder2Config
>>> # Initializing a Starcoder2 7B style configuration
>>> configuration = Starcoder2Config()
>>> # Initializing a model from the Starcoder2 7B style configuration
>>> model = Starcoder2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "starcoder2"
keys_to_ignore_at_inference = ["past_key_values"]
......
# yapf: disable
# Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in
......@@ -13,7 +12,6 @@ import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
......@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces=False,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
self.sp_model_kwargs = ({} if sp_model_kwargs is None else
sp_model_kwargs)
bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str) else bos_token)
eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str) else eos_token)
unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str) else unk_token)
pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str) else pad_token)
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
......@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
......@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
# make sure that special tokens are not decoded using
# sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
......@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(
self, save_directory, filename_prefix: Optional[str] = None
) -> Tuple[str]:
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
......@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
logger.error(f"Vocabulary path ({save_directory}) "
"should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "")
+ VOCAB_FILES_NAMES["vocab_file"],
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file
) and os.path.isfile(self.vocab_file):
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
......@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
already_has_special_tokens: bool = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens
added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
......@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
already_has_special_tokens (`bool`, *optional*, defaults to
`False`):
Whether or not the token list is already formatted with
special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (
bos_token_id
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
Creates a mask from the two sequences passed to be used in a
sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
......@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
`List[int]`: List of [token type IDs](../glossary#token-type-ids)
according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
......
......@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
# the Neuron-X backend does not have the `cuda_utils` module.
from vllm._C import cuda_utils
max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute(
gpu)
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail
max_shared_mem = (
cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
# will fail
assert max_shared_mem > 0, "max_shared_mem can not be zero"
return int(max_shared_mem)
......@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if not cuda_home:
cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info(
f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.'
)
logger.info(f'CUDA_HOME is not found in the environment. '
f'Using {cuda_home} as CUDA_HOME.')
else:
logger.warning(
f'Not found nvcc in {cuda_home}. Skip cuda version check!')
......
......@@ -93,14 +93,13 @@ class ModelRunner:
scheduler_config=self.scheduler_config)
self.model_memory_usage = m.consumed_memory
logger.info(
f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB"
)
logger.info(f"Loading model weights took "
f"{self.model_memory_usage / float(2**30):.4f} GB")
if self.lora_config:
assert hasattr(
self.model, "supported_lora_modules"
) and self.model.supported_lora_modules, "Model does not support LoRA"
assert hasattr(self.model, "supported_lora_modules"
) and self.model.supported_lora_modules, (
"Model does not support LoRA")
assert hasattr(
self.model,
"embedding_modules"), "Model does not have embedding_modules"
......
......@@ -79,7 +79,8 @@ class Worker:
cpu_swap_space: int = 0,
cache_dtype: str = "float16",
) -> Tuple[int, int]:
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks."""
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as
num_cpu_blocks."""
num_gpu_blocks = self.scheduler_config.max_num_seqs
num_cpu_blocks = 0
return num_gpu_blocks, num_cpu_blocks
......@@ -177,7 +178,8 @@ def _init_distributed_environment(
"distributed_init_method must be set if torch.distributed "
"is not already initialized")
else:
distributed_backend = distributed_backend if distributed_backend else "nccl"
distributed_backend = (distributed_backend
if distributed_backend else "nccl")
torch.distributed.init_process_group(
backend=distributed_backend,
world_size=parallel_config.world_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment