Unverified Commit 67c2dbdb authored by Yih-Dar, committed by GitHub

Time to Say Goodbye, torch 1.7 and 1.8 (#22291)



* time to say goodbye, torch 1.7 and 1.8

* clean up torch_int_div

* clean up is_torch_less_than_1_8-9

* update

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 86c7931a
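The thrust of the diff below: with torch 1.7 and 1.8 dropped, the `torch_int_div` compatibility shim in `pytorch_utils` (which branched on the torch version internally) is deleted, and every call site switches to `torch.div(..., rounding_mode="floor")` directly. A minimal sketch of the replacement pattern, with illustrative tensor values:

```python
import torch

# Illustrative input: flat top-k indices over a (num_queries * num_classes) grid.
topk_indices = torch.tensor([7, 12, 23])
num_classes = 5

# Old: torch_int_div(topk_indices, num_classes)  (helper removed in this PR)
# New: the direct call, available since torch 1.8 and now safe to use unconditionally.
print(torch.div(topk_indices, num_classes, rounding_mode="floor"))  # tensor([1, 2, 4])
```

The same one-line substitution recurs through every model file touched below.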
@@ -58,8 +58,6 @@ if is_torch_available():
     import torch
     from torch import nn

-    from ...pytorch_utils import torch_int_div

 # Copied from transformers.models.detr.image_processing_detr.max_across_indices
 def max_across_indices(values: Iterable[Any]) -> List[Any]:
@@ -1122,7 +1120,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
             scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False)
             labels_per_image = labels[topk_indices]

-            topk_indices = torch_int_div(topk_indices, num_classes)
+            topk_indices = torch.div(topk_indices, num_classes, rounding_mode="floor")
             # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
             mask_pred = masks_queries_logits[i][topk_indices]
...
@@ -28,7 +28,6 @@ from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_sew import SEWConfig
@@ -770,7 +769,7 @@ class SEWPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
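The `_conv_out_length` helper above recurs verbatim in the SEW-D, SpeechT5, UniSpeech, UniSpeech-SAT, Wav2Vec2, Wav2Vec2-Conformer, and WavLM hunks below. It is the Conv1d output-length formula with padding 0 and dilation 1: floor((length - kernel_size) / stride) + 1. A worked sketch, assuming wav2vec2-style kernel/stride values (illustrative, not taken from this diff):

```python
import torch

def conv_out_length(input_length, kernel_size, stride):
    # Conv1d output length with padding=0, dilation=1: floor((L - kernel) / stride) + 1
    return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

# Assumed wav2vec2-style feature extractor (seven conv layers); values are illustrative.
conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)

lengths = torch.tensor([16000])  # one second of 16 kHz audio
for kernel_size, stride in zip(conv_kernel, conv_stride):
    lengths = conv_out_length(lengths, kernel_size, stride)
print(lengths)  # tensor([49]) -> roughly 49 feature frames per second of audio
```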
@@ -29,7 +29,7 @@ from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import softmax_backward_data, torch_int_div
+from ...pytorch_utils import softmax_backward_data
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_sew_d import SEWDConfig
@@ -1305,7 +1305,7 @@ class SEWDPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -34,7 +34,6 @@ from ...modeling_outputs import (
     Seq2SeqSpectrogramOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig
@@ -620,7 +619,7 @@ class SpeechT5SpeechEncoderPrenet(nn.Module):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -26,7 +26,6 @@ from torch import Tensor, nn
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -380,7 +379,7 @@ class TableTransformerSinePositionEmbedding(nn.Module):
         x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale

         dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
-        dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2) / self.embedding_dim)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

         pos_x = x_embed[:, :, :, None] / dim_t
         pos_y = y_embed[:, :, :, None] / dim_t
...
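The floor division in `dim_t` is what pairs adjacent channels onto a shared frequency: the exponents run 0, 0, 2/d, 2/d, 4/d, 4/d, ... so each (sin, cos) channel pair uses one divisor. A quick check of that line in isolation, with a small assumed `embedding_dim` and the usual temperature of 10000:

```python
import torch

embedding_dim, temperature = 8, 10000  # small assumed values for illustration
dim_t = torch.arange(embedding_dim, dtype=torch.float32)
exponents = 2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim
print(exponents)  # tensor([0.0000, 0.0000, 0.2500, 0.2500, 0.5000, 0.5000, 0.7500, 0.7500])
dim_t = temperature ** exponents  # consecutive (sin, cos) channels share a divisor
```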
@@ -33,7 +33,6 @@ from ...pytorch_utils import (
     apply_chunking_to_forward,
     find_pruneable_heads_and_indices,
     prune_linear_layer,
-    torch_int_div,
 )
 from ...utils import (
     ModelOutput,
@@ -1637,7 +1636,7 @@ class ProductIndexMap(IndexMap):
     def project_outer(self, index):
         """Projects an index with the same index set onto the outer components."""
-        indices = torch_int_div(index.indices, self.inner_index.num_segments).type(torch.long)
+        indices = torch.div(index.indices, self.inner_index.num_segments, rounding_mode="floor").type(torch.long)
         return IndexMap(indices=indices, num_segments=self.outer_index.num_segments, batch_dims=index.batch_dims)

     def project_inner(self, index):
...
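For context on the TAPAS change: a `ProductIndexMap` flattens an (outer, inner) index pair as `outer * num_inner + inner`, so `project_outer` recovers the outer component by floor division while `project_inner` (unchanged here) takes the remainder. A toy round-trip with an assumed inner size:

```python
import torch

num_inner = 4  # assumed inner index size, for illustration only
flat = torch.tensor([0, 5, 7, 10])  # flattened indices: outer * num_inner + inner

outer = torch.div(flat, num_inner, rounding_mode="floor").type(torch.long)  # the new call
inner = torch.fmod(flat, num_inner).type(torch.long)                        # inner projection
print(outer.tolist(), inner.tolist())  # [0, 1, 1, 2] [0, 1, 3, 2]
```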
@@ -29,7 +29,6 @@ from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -981,7 +980,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -36,7 +36,6 @@ from ...modeling_outputs import (
     XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -995,7 +994,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -37,7 +37,6 @@ from ...modeling_outputs import (
     XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -1098,7 +1097,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -35,7 +35,6 @@ from ...modeling_outputs import (
     XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -1143,7 +1142,7 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -36,7 +36,6 @@ from ...modeling_outputs import (
     XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_wavlm import WavLMConfig
@@ -1019,7 +1018,7 @@ class WavLMPreTrainedModel(PreTrainedModel):
         def _conv_out_length(input_length, kernel_size, stride):
             # 1D convolutional layer output length formula taken
             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch_int_div(input_length - kernel_size, stride) + 1
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
...
@@ -26,7 +26,6 @@ from ..utils import (
     TensorType,
     is_tf_available,
     is_torch_available,
-    is_torch_onnx_dict_inputs_support_available,
     logging,
 )
 from .config import OnnxConfig
@@ -339,9 +338,6 @@ def export(
     if is_torch_available():
         from ..utils import torch_version

-        if not is_torch_onnx_dict_inputs_support_available():
-            raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")
-
         if not config.is_torch_support_available:
             logger.warning(
                 f"Unsupported PyTorch version for this model. Minimum required is {config.torch_onnx_minimum_version},"
...
@@ -27,22 +27,10 @@ logger = logging.get_logger(__name__)
 parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version)

-is_torch_less_than_1_8 = parsed_torch_version_base < version.parse("1.8.0")
-is_torch_less_than_1_9 = parsed_torch_version_base < version.parse("1.9.0")
 is_torch_greater_or_equal_than_1_10 = parsed_torch_version_base >= version.parse("1.10")
 is_torch_less_than_1_11 = parsed_torch_version_base < version.parse("1.11")

-def torch_int_div(tensor1, tensor2):
-    """
-    A function that performs integer division across different versions of PyTorch.
-    """
-    if is_torch_less_than_1_8:
-        return tensor1 // tensor2
-    else:
-        return torch.div(tensor1, tensor2, rounding_mode="floor")

 def softmax_backward_data(parent, grad_output, output, dim, self):
     """
     A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according
...
@@ -151,7 +151,6 @@ from .import_utils import (
     is_torch_fx_available,
     is_torch_fx_proxy,
     is_torch_neuroncore_available,
-    is_torch_onnx_dict_inputs_support_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
...
@@ -276,7 +276,6 @@ except importlib_metadata.PackageNotFoundError:
 # This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs.
 TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
-TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")

 def is_kenlm_available():
@@ -388,7 +387,7 @@ def is_torch_tf32_available():
 torch_version = None
-_torch_fx_available = _torch_onnx_dict_inputs_support_available = False
+_torch_fx_available = False
 if _torch_available:
     torch_version = version.parse(importlib_metadata.version("torch"))
     _torch_fx_available = (torch_version.major, torch_version.minor) >= (
@@ -396,8 +395,6 @@ if _torch_available:
         TORCH_FX_REQUIRED_VERSION.minor,
     )

-    _torch_onnx_dict_inputs_support_available = torch_version >= TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION

 def is_torch_fx_available():
     return _torch_fx_available
@@ -407,10 +404,6 @@ def is_bs4_available():
     return importlib.util.find_spec("bs4") is not None

-def is_torch_onnx_dict_inputs_support_available():
-    return _torch_onnx_dict_inputs_support_available

 def is_tf_available():
     return _tf_available
...
@@ -32,7 +32,6 @@ if is_torch_available():
         DisjunctiveConstraint,
         PhrasalConstraint,
     )
-    from transformers.pytorch_utils import torch_int_div

 class BeamSearchTester:
@@ -161,7 +160,9 @@ class BeamSearchTester:
         expected_output_scores = cut_expected_tensor(next_scores)

         # add num_beams * batch_idx
-        offset = torch_int_div(torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams)
+        offset = torch.div(
+            torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor"
+        )
         expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams

         self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist())
@@ -398,7 +399,9 @@ class ConstrainedBeamSearchTester:
         expected_output_scores = cut_expected_tensor(next_scores)

         # add num_beams * batch_idx
-        offset = torch_int_div(torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams)
+        offset = torch.div(
+            torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor"
+        )
         expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams

         self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist())
...
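The `offset` rewritten in these test hunks maps a flat beam slot back to its batch index: beams are laid out contiguously per batch, so floor-dividing the flat position by `num_beams` gives the owning batch. With small assumed sizes:

```python
import torch

num_beams, batch_size = 3, 2  # small assumed sizes for illustration
offset = torch.div(torch.arange(num_beams * batch_size), num_beams, rounding_mode="floor")
print(offset)              # tensor([0, 0, 0, 1, 1, 1]) -> batch index of each flat beam slot
print(offset * num_beams)  # tensor([0, 0, 0, 3, 3, 3]) -> added to per-batch beam indices
```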
@@ -38,10 +38,9 @@ if is_torch_available():
         BloomModel,
         BloomTokenizerFast,
     )
-    from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10, is_torch_less_than_1_9
+    from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10
 else:
     is_torch_greater_or_equal_than_1_10 = False
-    is_torch_less_than_1_9 = True

 @require_torch
@@ -751,9 +750,6 @@ class BloomEmbeddingTest(unittest.TestCase):
             self.assertAlmostEqual(EMBEDDINGS_DS_AFTER_LN[key][idx], output_dict_norm[key][idx], places=1)

     @require_torch
-    @unittest.skipIf(
-        is_torch_less_than_1_9, reason="Test failed with torch < 1.9 (`min_cuda` not implemented for `BFloat16`)"
-    )
     def test_hidden_states_transformers(self):
         cuda_available = torch.cuda.is_available()
         model = BloomModel.from_pretrained(self.path_bigscience_model, use_cache=False, torch_dtype="auto").to(
...
@@ -32,9 +32,6 @@ if is_torch_available():
     import torch

     from transformers import MCTCTForCTC, MCTCTModel, MCTCTProcessor
-    from transformers.pytorch_utils import is_torch_less_than_1_9
-else:
-    is_torch_less_than_1_9 = True

 class MCTCTModelTester:
@@ -265,7 +262,6 @@ class MCTCTModelTester:
 @require_torch
-@unittest.skipIf(is_torch_less_than_1_9, "MCTCT is only available in torch v1.9+")
 class MCTCTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MCTCTForCTC, MCTCTModel) if is_torch_available() else ()
     pipeline_model_mapping = (
...
@@ -33,9 +33,7 @@ if is_torch_available():
     from transformers import SwinBackbone, SwinForImageClassification, SwinForMaskedImageModeling, SwinModel
     from transformers.models.swin.modeling_swin import SWIN_PRETRAINED_MODEL_ARCHIVE_LIST
-    from transformers.pytorch_utils import is_torch_less_than_1_9
-else:
-    is_torch_less_than_1_9 = True

 if is_vision_available():
     from PIL import Image
@@ -266,7 +264,6 @@ class SwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_multi_gpu_data_parallel_forward(self):
         pass

-    @unittest.skipIf(is_torch_less_than_1_9, reason="This test fails for SwinModel when torch < 1.9")
     def test_training_gradient_checkpointing(self):
         super().test_training_gradient_checkpointing()
...
@@ -36,9 +36,6 @@ if is_torch_available():
     from transformers.models.trajectory_transformer.modeling_trajectory_transformer import (
         TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
-    from transformers.pytorch_utils import is_torch_less_than_1_9
-else:
-    is_torch_less_than_1_9 = True

 class TrajectoryTransformerModelTester:
@@ -199,7 +196,6 @@ class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pi
         ).loss
         loss.backward()

-    @unittest.skipIf(is_torch_less_than_1_9, reason="This test fails for TrajectoryTransformerModel when torch < 1.9")
     def test_training_gradient_checkpointing(self):
         if not self.model_tester.is_training:
             return
...