Re-enable the 80 char line width limit (#3305)

2f8844ba · Zhuohan Li · GitHub · 4b59f00e · 2f8844ba · 2f8844ba
Unverified commit 2f8844ba, authored Mar 10, 2024 by Zhuohan Li; committed by GitHub on Mar 10, 2024.
7 changed files
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -10,7 +10,8 @@ from vllm.worker.worker import Worker
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.config import CacheConfig
-from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len
+from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids,
+                                   split_batch_by_proposal_len)
 from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import SpeculativeScorer
@@ -25,7 +26,7 @@ class SpecDecodeWorker:
    LLM, after which some verification routine determines which (if any) of the
    speculative tokens are accepted by the larger LLM.
-    See https://github.com/vllm-project/vllm/pull/2188 and 
+    See https://github.com/vllm-project/vllm/pull/2188 and
    https://github.com/vllm-project/vllm/pull/3103 for more info.
    The current implementation has the following limitations:
@@ -109,10 +110,12 @@ class SpecDecodeWorker:
                block_size, gpu_memory_utilization, cpu_swap_space,
                cache_dtype))
-        scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes(
-            block_size, cache_dtype)
-        proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes(
-            block_size, cache_dtype)
+        scorer_cache_block_size_bytes = (
+            self.scorer_worker.get_cache_block_size_bytes(
+                block_size, cache_dtype))
+        proposer_cache_block_size_bytes = (
+            self.proposer_worker.get_cache_block_size_bytes(
+                block_size, cache_dtype))
        new_num_gpu_blocks = split_num_cache_blocks_evenly(
            scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
@@ -320,8 +323,8 @@ class SpecDecodeWorker:
            sampler_output_list.append(
                SamplerOutput(outputs=step_output_token_ids))
-        maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics(
-            k)
+        maybe_rejsample_metrics = (
+            self._metrics.maybe_collect_rejsample_metrics(k))
        if maybe_rejsample_metrics is not None:
            sampler_output_list[
                0].spec_decode_worker_metrics = maybe_rejsample_metrics

--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -62,62 +62,6 @@ class MPTConfig(PretrainedConfig):
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
-        """The MPT configuration class.
-        Args:
-            d_model (int): The size of the embedding dimension of the model.
-            n_heads (int): The number of attention heads.
-            n_layers (int): The number of layers in the model.
-            expansion_ratio (int): The ratio of the up/down scale in the ffn.
-            max_seq_len (int): The maximum sequence length of the model.
-            vocab_size (int): The size of the vocabulary.
-            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
-            emb_pdrop (float): The dropout probability for the embedding layer.
-            learned_pos_emb (bool): Whether to use learned positional embeddings
-            attn_config (Dict): A dictionary used to configure the model's attention module:
-                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
-                attn_pdrop (float): The dropout probability for the attention layers.
-                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
-                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
-                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
-                    this value.
-                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
-                    use the default scale of ``1/sqrt(d_keys)``.
-                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
-                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
-                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
-                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
-                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
-                    which sub-sequence each token belongs to.
-                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
-                alibi (bool): Whether to use the alibi bias instead of position embeddings.
-                alibi_bias_max (int): The maximum value of the alibi bias.
-                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
-            ffn_config (Dict): A dictionary used to configure the model's ffn module:
-                ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
-            init_device (str): The device to use for parameter initialization.
-            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
-            no_bias (bool): Whether to use bias in all layers.
-            verbose (int): The verbosity level. 0 is silent.
-            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
-            norm_type (str): choose type of norm to use
-            use_cache (bool): Whether or not the model should return the last key/values attentions
-            init_config (Dict): A dictionary used to configure the model initialization:
-                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
-                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
-                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
-                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
-                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
-                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
-                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
-                init_std (float): The standard deviation of the normal distribution used to initialize the model,
-                    if using the baseline_ parameter initialization scheme.
-                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
-                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
-                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
-                ---
-                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
-            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
-        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
@@ -139,8 +83,8 @@ class MPTConfig(PretrainedConfig):
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
-                'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
-            ),
+                'verbose argument for MPTConfig is now ignored and '
+                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        if 'name' in kwargs:
            del kwargs['name']
@@ -149,7 +93,8 @@ class MPTConfig(PretrainedConfig):
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
-                f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`',
+                f'alibi is turned on, setting `learned_pos_emb` '
+                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()
@@ -176,8 +121,8 @@ class MPTConfig(PretrainedConfig):
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
        )):
            raise ValueError(
-                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1"  # pylint: disable=line-too-long
-            )
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
+                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
@@ -193,17 +138,17 @@ class MPTConfig(PretrainedConfig):
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
-                'attn_uses_sequence_id only implemented with torch and triton attention.'  # pylint: disable=line-too-long
-            )
+                'attn_uses_sequence_id only implemented with torch '
+                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
-                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'  # pylint: disable=line-too-long
-            )
+                'model.embedding_fraction must be between 0 (exclusive) '
+                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
-                f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."  # pylint: disable=line-too-long
-            )
+                f"self.logit_scale={self.logit_scale!r} is not recognized as "
+                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
@@ -219,11 +164,11 @@ class MPTConfig(PretrainedConfig):
                del te
            except Exception as exc:
                raise ImportError(
-                    # pylint: disable=line-too-long
-                    'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. '
-                    +
-                    'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n'
-                    + 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
+                    'TransformerEngine import fail. `fc_type: te` requires '
+                    'TransformerEngine be installed. '
+                    'The required version of transformer_engine also requires '
+                    'FlashAttention v1.0.6 is installed:\n'
+                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        if self.ffn_config['ffn_type'] == 'mptmlp':

--- a/vllm/transformers_utils/configs/starcoder2.py
+++ b/vllm/transformers_utils/configs/starcoder2.py
@@ -2,78 +2,6 @@ from transformers import PretrainedConfig
 class Starcoder2Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
-    Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        vocab_size (`int`, *optional*, defaults to 49152):
-            Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Starcoder2Model`]
-        hidden_size (`int`, *optional*, defaults to 3072):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 12288):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 30):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 24):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*, defaults to 2):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 4096):
-            The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
-            allows sequence of up to 4096*32 tokens.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        norm_epsilon (`float`, *optional*, defaults to 1e-05):
-            Epsilon value for the layer norm
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        bos_token_id (`int`, *optional*, defaults to 50256):
-            The id of the "beginning-of-sequence" token.
-        eos_token_id (`int`, *optional*, defaults to 50256):
-            The id of the "end-of-sequence" token.
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        sliding_window (`int`, *optional*):
-            Sliding window attention window size. If not specified, will default to `None` (no sliding window).
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        residual_dropout (`float`, *optional*, defaults to 0.0):
-            Residual connection dropout value.
-        embedding_dropout (`float`, *optional*, defaults to 0.0):
-            Embedding dropout.
-        use_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use bias term on linear layers of the model.
-    ```python
-    >>> from transformers import Starcoder2Model, Starcoder2Config
-    >>> # Initializing a Starcoder2 7B style configuration
-    >>> configuration = Starcoder2Config()
-    >>> # Initializing a model from the Starcoder2 7B style configuration
-    >>> model = Starcoder2Model(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
    model_type = "starcoder2"
    keys_to_ignore_at_inference = ["past_key_values"]

--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
-# yapf: disable
 # Adapted from
 # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
 # This includes a fix suggested in
@@ -13,7 +12,6 @@ import sentencepiece as spm
 from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False)
-            if isinstance(pad_token, str)
-            else pad_token
-        )
+        self.sp_model_kwargs = ({} if sp_model_kwargs is None else
+                                sp_model_kwargs)
+        bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
+                     if isinstance(bos_token, str) else bos_token)
+        eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
+                     if isinstance(eos_token, str) else eos_token)
+        unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
+                     if isinstance(unk_token, str) else unk_token)
+        pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
+                     if isinstance(pad_token, str) else pad_token)
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
    def get_vocab(self):
        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
        vocab.update(self.added_tokens_encoder)
        return vocab
@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
-            # make sure that special tokens are not decoded using sentencepiece model
+            # make sure that special tokens are not decoded using
+            # sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string
-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
+    def save_vocabulary(self,
+                        save_directory,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.
@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            logger.error(f"Vocabulary path ({save_directory}) "
+                         "should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + VOCAB_FILES_NAMES["vocab_file"],
+            (filename_prefix + "-" if filename_prefix else "") +
+            VOCAB_FILES_NAMES["vocab_file"],
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
+                out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
-        return (out_vocab_file,)
+        return (out_vocab_file, )
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        Retrieve sequence ids from a token list that has no special tokens
+        added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
        Args:
@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
+            already_has_special_tokens (`bool`, *optional*, defaults to
+            `False`):
+                Whether or not the token list is already formatted with
+                special tokens for the model.
        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]:
+            1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
+        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        Creates a mask from the two sequences passed to be used in a
+        sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:
        ```
@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
                Optional second list of IDs for sequence pairs.
        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids)
+            according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    # the Neuron-X backend does not have the `cuda_utils` module.
    from vllm._C import cuda_utils
-    max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute(
-        gpu)
-    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail
+    max_shared_mem = (
+        cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
+    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
+    # will fail
    assert max_shared_mem > 0, "max_shared_mem can not be zero"
    return int(max_shared_mem)
@@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]:
    if not cuda_home:
        cuda_home = '/usr/local/cuda'
        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.'
-            )
+            logger.info(f'CUDA_HOME is not found in the environment. '
+                        f'Using {cuda_home} as CUDA_HOME.')
        else:
            logger.warning(
                f'Not found nvcc in {cuda_home}. Skip cuda version check!')

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -93,14 +93,13 @@ class ModelRunner:
                                   scheduler_config=self.scheduler_config)
        self.model_memory_usage = m.consumed_memory
-        logger.info(
-            f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB"
-        )
+        logger.info(f"Loading model weights took "
+                    f"{self.model_memory_usage / float(2**30):.4f} GB")
        if self.lora_config:
-            assert hasattr(
-                self.model, "supported_lora_modules"
-            ) and self.model.supported_lora_modules, "Model does not support LoRA"
+            assert hasattr(self.model, "supported_lora_modules"
+                           ) and self.model.supported_lora_modules, (
+                               "Model does not support LoRA")
            assert hasattr(
                self.model,
                "embedding_modules"), "Model does not have embedding_modules"

--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -79,7 +79,8 @@ class Worker:
        cpu_swap_space: int = 0,
        cache_dtype: str = "float16",
    ) -> Tuple[int, int]:
-        """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks."""
+        """Simply returns max_num_seqs as num_gpu_blocks, 0 as
+        num_cpu_blocks."""
        num_gpu_blocks = self.scheduler_config.max_num_seqs
        num_cpu_blocks = 0
        return num_gpu_blocks, num_cpu_blocks
@@ -177,7 +178,8 @@ def _init_distributed_environment(
            "distributed_init_method must be set if torch.distributed "
            "is not already initialized")
    else:
-        distributed_backend = distributed_backend if distributed_backend else "nccl"
+        distributed_backend = (distributed_backend
+                               if distributed_backend else "nccl")
        torch.distributed.init_process_group(
            backend=distributed_backend,
            world_size=parallel_config.world_size,