Unverified Commit bab02ff2 authored by drbh, committed by GitHub

feat: add ruff and resolve issue (#2262)

* feat: add ruff and resolve issue

* fix: update client exports and adjust after rebase

* fix: adjust syntax to avoid circular import

* fix: adjust client ruff settings

* fix: lint and refactor import check and avoid model enum as global names

* fix: improve fbgemm_gpu check and lints

* fix: update lints

* fix: prefer comparing model enum over str

* fix: adjust lints and ignore specific rules

* fix: avoid unneeded quantize check
parent 4b49c50f
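
For context, the lint fixes listed above map to standard ruff rules: unused imports (F401), unused local variables (F841), bare except clauses (E722), f-strings with nothing to interpolate (F541), ambiguous single-letter names (E741), and comparisons against False with == (E712), plus the project-specific preference for comparing the model enum rather than strings. The sketch below is a minimal illustration of those patterns in their fixed form; the names (ModelType, describe_model, hypothetical_optional_kernels) are hypothetical and do not appear in the repository.

import enum


class ModelType(enum.Enum):
    # Hypothetical stand-in for the server's model-type enum; the real names differ.
    LLAMA = "llama"
    DEEPSEEK_V2 = "deepseek_v2"


def describe_model(model_type: ModelType, use_cache=None) -> str:
    # E712: test against False with "is", not "==" (compare the MPT hunk below).
    if use_cache is False:
        raise NotImplementedError("use_cache=False is not supported here")
    # "prefer comparing model enum over str": compare enum members, not .value strings.
    if model_type is ModelType.DEEPSEEK_V2:
        # F541: a plain string literal, no f-prefix when nothing is interpolated.
        return "deepseek v2"
    return "other model"


try:
    import hypothetical_optional_kernels  # purely illustrative optional dependency
except Exception as import_error:  # E722: never use a bare "except:"
    hypothetical_optional_kernels = None
    # E741: prefer a descriptive name over a single letter such as "l" or "u".
    optional_kernels_error = import_error

Calling describe_model(ModelType.LLAMA) simply returns "other model"; the point is the shape of the fixes, not the server's actual control flow.
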
......@@ -39,6 +39,12 @@ from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
if SYSTEM == "rocm":
try:
from vllm import _custom_C
except Exception as e:
raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
class DeepseekV2Config(PretrainedConfig):
def __init__(
......
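
The ROCm hunk above guards the optional vllm._custom_C extension import so that a missing or broken kernel build surfaces as a clear ImportError with the original error attached. Below is a self-contained sketch of a common variation on that guarded-import pattern, which records the failure and raises only when the kernels are actually requested; the module name my_project._custom_kernels is hypothetical:

import importlib

_custom_kernels = None
_custom_kernels_error = None

try:
    # Hypothetical optional extension; the hunk above imports vllm._custom_C instead.
    _custom_kernels = importlib.import_module("my_project._custom_kernels")
except Exception as e:
    # Keep the original exception so the eventual error message stays actionable.
    _custom_kernels_error = e


def require_custom_kernels():
    # Fail at use time, echoing the preserved import error.
    if _custom_kernels is None:
        raise ImportError(
            f"Could not load optional custom kernels. Full error: {_custom_kernels_error}"
        )
    return _custom_kernels

Raising immediately at import time, as the diff does, is the stricter choice; deferring the error keeps CPU-only environments importable while still pointing at the root cause once the kernels are needed.
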
......@@ -46,7 +46,6 @@ from text_generation_server.layers.layernorm import (
FastRMSNorm,
)
from text_generation_server.utils.weights import (
UnquantizedWeight,
Weights,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
......@@ -277,7 +276,7 @@ class LlamaMLP(nn.Module):
bias=bias,
)
else:
prefixes = [f"gate_proj", f"up_proj"]
prefixes = ["gate_proj", "up_proj"]
sizes = [
config.intermediate_size,
config.intermediate_size,
......
......@@ -28,7 +28,6 @@ from typing import Optional, List, Tuple
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import (
Seqlen,
paged_attention,
attention,
reshape_and_cache,
......@@ -38,7 +37,6 @@ from text_generation_server.layers import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
SpeculativeHead,
get_linear,
TensorParallelMultiAdapterLinear,
TensorParallelAdapterRowLinear,
)
......
......@@ -21,7 +21,6 @@
import torch
import torch.distributed
import numpy as np
from torch import nn
from text_generation_server.utils.import_utils import SYSTEM
......@@ -31,7 +30,6 @@ if SYSTEM != "ipex":
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from loguru import logger
from text_generation_server.layers.attention import (
paged_attention,
......
......@@ -16,7 +16,6 @@
import torch
import torch.distributed
from torch import nn
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
......
......@@ -15,7 +15,6 @@ from text_generation_server.layers import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
SpeculativeHead,
get_linear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
......
......@@ -14,7 +14,7 @@
# limitations under the License.
""" PyTorch Idefics2 model."""
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple
import torch
import torch.utils.checkpoint
......@@ -22,10 +22,8 @@ from torch import nn
import math
from transformers.activations import ACT2FN
from transformers.image_processing_utils import select_best_resolution
from text_generation_server.models.custom_modeling.vlm import (
load_text_model,
load_vision_model,
)
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
......
......@@ -19,6 +19,7 @@ import numpy as np
from PIL import Image
import transformers
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
resize,
......@@ -293,6 +294,4 @@ class IdeficsImageProcessor(BaseImageProcessor):
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
import transformers
transformers.IdeficsImageProcessor = IdeficsImageProcessor
......@@ -21,10 +21,8 @@
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import PreTrainedModel
from transformers.activations import ACT2FN
......@@ -33,13 +31,6 @@ from transformers.modeling_outputs import (
CausalLMOutputWithPast,
dataclass,
)
from transformers.modeling_utils import PretrainedConfig
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
from text_generation_server.models.custom_modeling.idefics_vision import (
IdeficsVisionTransformer,
......@@ -56,6 +47,7 @@ from text_generation_server.layers import (
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.utils.import_utils import SYSTEM
from loguru import logger
if SYSTEM == "cuda":
import dropout_layer_norm
......@@ -237,7 +229,7 @@ class IdeficsDecoupledPartialTPEmbedding(nn.Module):
prefix="model.embed_tokens", weights=weights
)
self.additional_weight = nn.Parameter(
weights.get_tensor(f"model.embed_tokens.additional_embedding.weight")
weights.get_tensor("model.embed_tokens.additional_embedding.weight")
)
def forward(self, input_ids):
......@@ -499,7 +491,6 @@ class IdeficsAttention(nn.Module):
# if not hasattr(nn.functional, "scaled_dot_product_attention"):
# raise ValueError("this model requires pytorch 2.0 or higher")
process_group = weights.process_group
if self.num_heads % weights.process_group.size() != 0:
raise ValueError(
f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
......@@ -1024,7 +1015,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
if config.use_resampler:
perceiver_config = config.perceiver_config
self.perceiver_resampler = IdeficsPerceiverResampler(
prefix=f"model.perceiver_resampler",
prefix="model.perceiver_resampler",
config=config,
embed_dim=config.vision_config.embed_dim,
depth=perceiver_config.resampler_depth,
......@@ -1052,7 +1043,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
# self.gradient_checkpointing = False
self.norm = IdeficsRMSNorm(
prefix=f"model.norm", weights=weights, eps=config.rms_norm_eps
prefix="model.norm", weights=weights, eps=config.rms_norm_eps
)
# self.gradient_checkpointing = False
......
......@@ -169,7 +169,6 @@ class IdeficsPerceiverAttention(nn.Module):
self.qk_scale = self.head_dim**-0.5
process_group = weights.process_group
if n_heads % weights.process_group.size() != 0:
raise ValueError(
f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {n_heads} "
......
......@@ -28,9 +28,6 @@ from transformers.tokenization_utils_base import (
TruncationStrategy,
)
from transformers.utils import TensorType, is_torch_available
from text_generation_server.models.custom_modeling.idefics_image_processing import (
IdeficsImageProcessor,
)
if is_torch_available():
......
......@@ -129,7 +129,6 @@ class IdeficsVisionAttention(nn.Module):
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
process_group = weights.process_group
if self.num_heads % weights.process_group.size() != 0:
raise ValueError(
f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
......@@ -460,7 +459,6 @@ class IdeficsVisionTransformer(nn.Module):
def __init__(self, prefix, config, weights):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = IdeficsVisionEmbeddings(
prefix=f"{prefix}.embeddings", config=config, weights=weights
......
......@@ -14,7 +14,7 @@
# limitations under the License.
""" PyTorch Llava-NeXT model."""
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple
import torch
import torch.utils.checkpoint
......
......@@ -4,7 +4,6 @@ Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import os
import warnings
from typing import List, Optional, Tuple, Union
import torch
......@@ -194,7 +193,7 @@ def flash_attn_fn(
):
try:
from flash_attn import bert_padding, flash_attn_interface
except:
except Exception:
raise RuntimeError("Please install flash-attn==1.0.3.post0")
check_valid_inputs(query, key, value)
if past_key_value is not None:
......@@ -207,7 +206,7 @@ def flash_attn_fn(
_s_k = max(0, attn_bias.size(3) - key.size(1))
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
if attn_bias is not None:
raise NotImplementedError(f"attn_bias not implemented for flash attn.")
raise NotImplementedError("attn_bias not implemented for flash attn.")
(batch_size, seqlen) = query.shape[:2]
if key_padding_mask is None:
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
......@@ -269,13 +268,13 @@ def triton_flash_attn_fn(
):
try:
from .flash_attn_triton import flash_attn_func
except:
except Exception:
_installed = False
if version.parse(torch.__version__) < version.parse("2.0.0"):
_installed = True
try:
from flash_attn.flash_attn_triton import flash_attn_func
except:
except Exception:
_installed = False
if not _installed:
raise RuntimeError(
......@@ -292,9 +291,9 @@ def triton_flash_attn_fn(
_s_k = max(0, attn_bias.size(3) - key.size(1))
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
if dropout_p:
raise NotImplementedError(f"Dropout not implemented for attn_impl: triton.")
raise NotImplementedError("Dropout not implemented for attn_impl: triton.")
if needs_weights:
raise NotImplementedError(f"attn_impl: triton cannot return attn weights.")
raise NotImplementedError("attn_impl: triton cannot return attn weights.")
if key_padding_mask is not None:
warnings.warn(
"Propagating key_padding_mask to the attention module "
......@@ -428,7 +427,7 @@ class MultiQueryAttention(nn.Module):
additive bias.
"""
def __init__(self, config, prefix, weights):
def __init__(self, config, prefix, weights, verbose=False):
super().__init__()
attn_impl = config.attn_config.attn_impl
self.attn_impl = config.attn_config.attn_impl
......@@ -445,7 +444,7 @@ class MultiQueryAttention(nn.Module):
self.Wqkv = TensorParallelColumnLinear.load(
config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
)
fuse_splits = (d_model, d_model + self.head_dim)
(d_model, d_model + self.head_dim)
if self.qk_ln:
raise NotImplementedError("qk_ln not supported")
if self.attn_impl == "flash":
......@@ -795,7 +794,9 @@ class MPTModel(MPTPreTrainedModel):
self.alibi = config.attn_config.alibi
self.alibi_bias_max = config.attn_config.alibi_bias_max
if config.init_device == "mixed":
if dist.get_local_rank() == 0:
# TODO: reimplement mixed device initialization
# dist.get_local_rank() == 0:
if True:
config.init_device = "cpu"
else:
config.init_device = "meta"
......@@ -1016,7 +1017,7 @@ class MPTModel(MPTPreTrainedModel):
if past_key_values is not None:
if len(past_key_values) != self.config.n_layers:
raise ValueError(
f"past_key_values must provide a past_key_value for each attention "
"past_key_values must provide a past_key_value for each attention "
+ f"layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r})."
)
past_position = past_key_values[0][0].size(1)
......@@ -1182,7 +1183,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
input_ids = input_ids[:, -1].unsqueeze(-1)
if self.transformer.prefix_lm:
prefix_mask = torch.ones_like(attention_mask)
if kwargs.get("use_cache") == False:
if kwargs.get("use_cache") is False:
raise NotImplementedError(
"MPT with prefix_lm=True does not support use_cache=False."
)
......
......@@ -21,25 +21,14 @@ import torch
import torch.distributed
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import CrossEntropyLoss
from transformers.activations import ACT2FN
from transformers.file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers import GPTNeoXConfig
from loguru import logger
from text_generation_server.layers import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
......@@ -133,7 +122,6 @@ class GPTNeoXAttention(nn.Module):
self.hidden_size = config.hidden_size
self.head_size = self.hidden_size // self.num_attention_heads
self.rotary_ndims = int(self.head_size * config.rotary_pct)
max_positions = config.max_position_embeddings
# ??? TODO
# self.register_buffer(
# "bias",
......
......@@ -5,7 +5,7 @@ import torch.distributed
import math
from torch import nn
from typing import Optional, List, Tuple, Any
from typing import Optional, List, Tuple
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
......
from typing import Optional, Tuple, Union
from typing import Optional, Tuple
import warnings
import math
import torch
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import (
_create_4d_causal_attention_mask,
_prepare_4d_attention_mask,
)
from transformers.modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
)
from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from transformers import SiglipConfig, SiglipVisionConfig
from torch.nn.init import _calculate_fan_in_and_fan_out
from text_generation_server.layers.tensor_parallel import (
TensorParallelEmbedding,
......@@ -244,9 +239,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
return hidden_state[:, 0]
import warnings
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
......@@ -264,12 +256,12 @@ def _trunc_normal_(tensor, mean, std, a, b):
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
lower = norm_cdf((a - mean) / std)
upper = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
tensor.uniform_(2 * lower - 1, 2 * upper - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
......@@ -313,9 +305,6 @@ def trunc_normal_tf_(
tensor.mul_(std).add_(mean)
from torch.nn.init import _calculate_fan_in_and_fan_out
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
if mode == "fan_in":
......@@ -349,9 +338,6 @@ def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
from transformers import PreTrainedModel
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
......@@ -393,7 +379,6 @@ class SiglipVisionTransformer(nn.Module):
def __init__(self, prefix, config: SiglipVisionConfig, weights):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(
prefix=f"{prefix}.embeddings", config=config, weights=weights
......
......@@ -45,6 +45,15 @@ from text_generation_server.layers import (
SpeculativeHead,
)
# copied from https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/t5/modeling_t5.py#L1316
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
class PartialTPEmbedding(nn.Module):
def __init__(self, prefix: str, weights):
......@@ -1132,12 +1141,12 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
if labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-100)
# move labels to correct device to enable PP
labels = labels.to(lm_logits.device)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
labels = labels.to(logits.device)
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
if not return_dict:
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
output = (logits,) + decoder_outputs[1:] + encoder_outputs
return ((loss,) + output) if loss is not None else output
return (
......
......@@ -42,7 +42,7 @@ def load_vision_model(prefix, config, weights):
)
return SiglipVisionTransformer(
prefix=f"vision_tower.vision_model", config=config, weights=weights
prefix="vision_tower.vision_model", config=config, weights=weights
)
else:
raise RuntimeError(f"Unsupported model type {config.model_type}")
......@@ -1194,7 +1194,7 @@ class FlashCausalLM(Model):
if self.speculate is None or self.speculate + 1 <= bs:
self.cuda_graph_warmup(bs, max_s, max_bt)
except torch.cuda.OutOfMemoryError:
logger.exception(f"Decode cuda graph warmup failed")
logger.exception("Decode cuda graph warmup failed")
else:
log_master(
logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})."
......