Unverified Commit 67c2dbdb authored by Yih-Dar, committed by GitHub

Time to Say Goodbye, torch 1.7 and 1.8 (#22291)



* time to say goodbye, torch 1.7 and 1.8

* clean up torch_int_div

* clean up is_torch_less_than_1_8-9

* update

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 86c7931a
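
The change is mechanical throughout: `torch_int_div` in `pytorch_utils` was a thin compatibility shim (on torch >= 1.8 it already delegated to `torch.div(..., rounding_mode="floor")`), so with torch 1.9 as the new floor every call site can inline the `torch.div` call, and the `require_version_core("torch>=1.9")` guards and the `is_torch_less_than_1_8`/`is_torch_less_than_1_9` flags become dead code. A minimal sketch of the replacement pattern (tensor values are illustrative, not taken from the diff):

```python
import torch

# Floor division on integer tensors: the pattern used below to split flat
# top-k ids into (beam index, token id). torch.div with rounding_mode="floor"
# has existed since torch 1.8, so it is always available once torch>=1.9 is
# the minimum supported version.
vocab_size = 10
next_tokens = torch.tensor([[5, 17, 42]])

next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")  # tensor([[0, 1, 4]])
next_tokens = next_tokens % vocab_size                                    # tensor([[5, 7, 2]])
```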
@@ -171,7 +171,7 @@ _deps = [
     "timeout-decorator",
     "timm",
     "tokenizers>=0.11.1,!=0.11.3,<0.14",
-    "torch>=1.7,!=1.12.0",
+    "torch>=1.9,!=1.12.0",
     "torchaudio",
     "torchvision",
     "pyctcdecode>=0.4.0",
@@ -77,7 +77,7 @@ deps = {
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
     "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14",
-    "torch": "torch>=1.7,!=1.12.0",
+    "torch": "torch>=1.9,!=1.12.0",
    "torchaudio": "torchaudio",
    "torchvision": "torchvision",
    "pyctcdecode": "pyctcdecode>=0.4.0",
@@ -115,7 +115,6 @@ from .utils import (
     is_torch_cuda_available,
     is_torch_fx_available,
     is_torch_fx_proxy,
-    is_torch_onnx_dict_inputs_support_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
     is_torchaudio_available,
@@ -32,7 +32,6 @@ from ..models.auto import (
     MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
     MODEL_FOR_VISION_2_SEQ_MAPPING,
 )
-from ..pytorch_utils import torch_int_div
 from ..utils import ModelOutput, logging
 from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
 from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
@@ -2795,7 +2794,7 @@ class GenerationMixin:
                 next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
             )

-            next_indices = torch_int_div(next_tokens, vocab_size)
+            next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
             next_tokens = next_tokens % vocab_size

             # stateless
@@ -3129,7 +3128,7 @@ class GenerationMixin:
             next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1)
             next_tokens = torch.gather(next_tokens, -1, _indices)

-            next_indices = torch_int_div(next_tokens, vocab_size)
+            next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
             next_tokens = next_tokens % vocab_size

             # stateless
@@ -3473,7 +3472,7 @@ class GenerationMixin:
                     next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                 )

-                next_indices = torch_int_div(next_tokens, vocab_size)
+                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                 next_tokens = next_tokens % vocab_size

                 # stateless
@@ -3503,7 +3502,9 @@ class GenerationMixin:
                 # (beam_idx // group_size) -> batch_idx
                 # (beam_idx % group_size) -> offset of idx inside the group
                 reordering_indices[batch_group_indices] = (
-                    num_beams * torch_int_div(beam_idx, group_size) + group_start_idx + (beam_idx % group_size)
+                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor")
+                    + group_start_idx
+                    + (beam_idx % group_size)
                 )

             # Store scores, attentions and hidden_states when required
@@ -539,9 +539,6 @@ def _move_model_to_meta(model, loaded_state_dict_keys, start_prefix):
     """

-    # meta device was added in pt=1.9
-    require_version_core("torch>=1.9")
-
     # dematerialize param storage for keys that are going to be replaced by state_dict, by
     # putting those on the meta device
     for k in loaded_state_dict_keys:
@@ -2100,8 +2097,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):
             raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")

         if low_cpu_mem_usage:
-            # low_cpu_mem_usage requires PyTorch >= 1.9 to have the meta device.
-            require_version_core("torch>=1.9")
             if device_map is not None:
                 # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
                 require_version_core("torch>=1.10")
@@ -37,7 +37,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, torch_int_div
+from ...pytorch_utils import apply_chunking_to_forward
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -972,7 +972,7 @@ class BigBirdBlockSparseAttention(nn.Module):
         num_indices_to_pick_from = params.shape[2]

         shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
-        indices_shift = torch_int_div(shift, num_indices_to_gather) * num_indices_to_pick_from
+        indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from

         flattened_indices = indices.view(-1) + indices_shift
         flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
@@ -36,7 +36,6 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
@@ -791,7 +790,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
         num_indices_to_pick_from = params.shape[2]

         shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
-        indices_shift = torch_int_div(shift, num_indices_to_gather) * num_indices_to_pick_from
+        indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from

         flattened_indices = indices.view(-1) + indices_shift
         flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
@@ -68,8 +68,6 @@ if is_torch_available():
     import torch
     from torch import nn

-    from transformers.pytorch_utils import torch_int_div
-
 if is_vision_available():
     import PIL
@@ -1314,7 +1312,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         prob = out_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1)
         scores = topk_values
-        topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2])
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
         labels = topk_indexes % out_logits.shape[2]
         boxes = center_to_corners_format(out_bbox)
         boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
@@ -1360,7 +1358,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         prob = out_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
         scores = topk_values
-        topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2])
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
         labels = topk_indexes % out_logits.shape[2]
         boxes = center_to_corners_format(out_bbox)
         boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
@@ -26,7 +26,6 @@ from torch import Tensor, nn
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -452,7 +451,7 @@ class ConditionalDetrSinePositionEmbedding(nn.Module):
         x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale

         dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
-        dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

         pos_x = x_embed[:, :, :, None] / dim_t
         pos_y = y_embed[:, :, :, None] / dim_t
@@ -504,7 +503,7 @@ def build_position_encoding(config):
 def gen_sine_position_embeddings(pos_tensor):
     scale = 2 * math.pi
     dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
-    dim_t = 10000 ** (2 * torch_int_div(dim_t, 2) / 128)
+    dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 128)
     x_embed = pos_tensor[:, :, 0] * scale
     y_embed = pos_tensor[:, :, 1] * scale
     pos_x = x_embed[:, :, None] / dim_t
@@ -35,7 +35,6 @@ from ...modeling_outputs import (
     XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_data2vec_audio import Data2VecAudioConfig
@@ -731,7 +730,7 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
@@ -68,8 +68,6 @@ if is_torch_available():
     import torch
     from torch import nn

-    from ...pytorch_utils import torch_int_div
-
 if is_vision_available():
     import PIL
@@ -1312,7 +1310,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         prob = out_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
         scores = topk_values
-        topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2])
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
         labels = topk_indexes % out_logits.shape[2]
         boxes = center_to_corners_format(out_bbox)
         boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
@@ -1357,7 +1355,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         prob = out_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
         scores = topk_values
-        topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2])
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
         labels = topk_indexes % out_logits.shape[2]
         boxes = center_to_corners_format(out_bbox)
         boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
@@ -41,7 +41,7 @@ from ...file_utils import (
 )
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import meshgrid, torch_int_div
+from ...pytorch_utils import meshgrid
 from ...utils import is_ninja_available, logging
 from ..auto import AutoBackbone
 from .configuration_deformable_detr import DeformableDetrConfig
@@ -497,7 +497,7 @@ class DeformableDetrSinePositionEmbedding(nn.Module):
         x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale

         dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
-        dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

         pos_x = x_embed[:, :, :, None] / dim_t
         pos_y = y_embed[:, :, :, None] / dim_t
@@ -1552,7 +1552,7 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):
         scale = 2 * math.pi

         dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
-        dim_t = temperature ** (2 * torch_int_div(dim_t, 2) / num_pos_feats)
+        dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
         # batch_size, num_queries, 4
         proposals = proposals.sigmoid() * scale
         # batch_size, num_queries, 4, 128
@@ -63,7 +63,6 @@ from ...utils.generic import ExplicitEnum, TensorType
 if is_torch_available():
     import torch

-    from ...pytorch_utils import torch_int_div

 if is_torchvision_available():
     from torchvision.ops.boxes import batched_nms
@@ -967,7 +966,7 @@ class DetaImageProcessor(BaseImageProcessor):
         all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device)
         all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device)
-        all_boxes = torch_int_div(all_indexes, out_logits.shape[2])
+        all_boxes = torch.div(all_indexes, out_logits.shape[2], rounding_mode="floor")
         all_labels = all_indexes % out_logits.shape[2]

         boxes = center_to_corners_format(out_bbox)
@@ -36,7 +36,7 @@ from ...file_utils import (
 )
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import meshgrid, torch_int_div
+from ...pytorch_utils import meshgrid
 from ...utils import is_torchvision_available, logging, requires_backends
 from ..auto import AutoBackbone
 from .configuration_deta import DetaConfig
@@ -399,7 +399,7 @@ class DetaSinePositionEmbedding(nn.Module):
         x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale

         dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
-        dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

         pos_x = x_embed[:, :, :, None] / dim_t
         pos_y = y_embed[:, :, :, None] / dim_t
@@ -1463,7 +1463,7 @@ class DetaModel(DetaPreTrainedModel):
         scale = 2 * math.pi

         dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
-        dim_t = temperature ** (2 * torch_int_div(dim_t, 2) / num_pos_feats)
+        dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
         # batch_size, num_queries, 4
         proposals = proposals.sigmoid() * scale
         # batch_size, num_queries, 4, 128
@@ -26,7 +26,6 @@ from torch import Tensor, nn
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -442,7 +441,7 @@ class DetrSinePositionEmbedding(nn.Module):
         x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale

         dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
-        dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

         pos_x = x_embed[:, :, :, None] / dim_t
         pos_y = y_embed[:, :, :, None] / dim_t
@@ -27,7 +27,6 @@ from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -871,7 +870,7 @@ class HubertPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
@@ -31,7 +31,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, torch_int_div
+from ...pytorch_utils import apply_chunking_to_forward
 from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -770,7 +770,7 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         return embeddings

     def _calc_visual_bbox(self, image_feature_pool_shape, bbox, device, final_shape):
-        visual_bbox_x = torch_int_div(
+        visual_bbox_x = torch.div(
             torch.arange(
                 0,
                 1000 * (image_feature_pool_shape[1] + 1),
@@ -779,8 +779,9 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
                 dtype=bbox.dtype,
             ),
             self.config.image_feature_pool_shape[1],
+            rounding_mode="floor",
         )
-        visual_bbox_y = torch_int_div(
+        visual_bbox_y = torch.div(
             torch.arange(
                 0,
                 1000 * (self.config.image_feature_pool_shape[0] + 1),
@@ -789,6 +790,7 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
                 dtype=bbox.dtype,
             ),
             self.config.image_feature_pool_shape[0],
+            rounding_mode="floor",
         )

         visual_bbox = torch.stack(
             [
@@ -57,8 +57,6 @@ if is_torch_available():
     import torch
     from torch import nn

-    from ...pytorch_utils import torch_int_div
-

 # Copied from transformers.models.detr.image_processing_detr.max_across_indices
 def max_across_indices(values: Iterable[Any]) -> List[Any]:
@@ -1009,7 +1007,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
             scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False)
             labels_per_image = labels[topk_indices]

-            topk_indices = torch_int_div(topk_indices, num_classes)
+            topk_indices = torch.div(topk_indices, num_classes, rounding_mode="floor")
             mask_pred = mask_pred[topk_indices]
             pred_masks = (mask_pred > 0).float()
@@ -61,8 +61,6 @@ if is_torch_available():
     import torch
     from torch import nn

-    from ...pytorch_utils import torch_int_div
-

 # Copied from transformers.models.detr.image_processing_detr.max_across_indices
 def max_across_indices(values: Iterable[Any]) -> List[Any]:
@@ -1077,7 +1075,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
             scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False)
             labels_per_image = labels[topk_indices]

-            topk_indices = torch_int_div(topk_indices, num_classes)
+            topk_indices = torch.div(topk_indices, num_classes, rounding_mode="floor")
             mask_pred = mask_pred[topk_indices]
             pred_masks = (mask_pred > 0).float()
@@ -33,19 +33,12 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...pytorch_utils import is_torch_less_than_1_9
 from ...utils import logging
 from .configuration_mctct import MCTCTConfig


 logger = logging.get_logger(__name__)

-if is_torch_less_than_1_9:
-    logger.warning(
-        f"You are using torch=={torch.__version__}, but torch>=1.9.0 is required to use MCTCTModel. Please upgrade"
-        " torch."
-    )
-
 _HIDDEN_STATES_START_POSITION = 1

 _CONFIG_FOR_DOC = "MCTCTConfig"