Unverified Commit 952a77b0 authored by Patrick von Platen, committed by GitHub

[Perceiver] Skip multi-gpu tests for now (#14813)

* [Perceiver] Skip multi-gpu tests for now

* Update tests/test_modeling_perceiver.py

* up

* up
parent 8a818c26
@@ -86,6 +86,10 @@ is implemented in the library. Note that the models available in the library only showcase some examples of what you can do
 with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection,
 audio classification, video classification, etc.
+
+**Note**:
+
+- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)

 ## Perceiver specific outputs

 [[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput
@@ -35,6 +35,11 @@ while being much more memory-efficient and much faster on long sequences.*

 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
 found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.
+
+**Note**:
+
+- Reformer does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see `issue #36035
+  <https://github.com/pytorch/pytorch/issues/36035>`__

 Axial Positional Encodings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -44,6 +44,11 @@ Tips:

 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/kimiyoung/transformer-xl>`__.
+
+**Note**:
+
+- TransformerXL does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see `issue #36035
+  <https://github.com/pytorch/pytorch/issues/36035>`__

 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
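All three doc notes point at the same upstream bug, pytorch/pytorch#36035 in `torch.nn.DataParallel`. For context (this sketch is not part of the commit), the usual way to run such models on multiple GPUs is `torch.nn.parallel.DistributedDataParallel`; the tiny `Linear` model below is a stand-in for any of the affected models:

```python
# Minimal sketch: multi-GPU forward pass with DistributedDataParallel instead
# of nn.DataParallel. The Linear model is a stand-in for e.g. PerceiverModel.
# Launch with: torchrun --nproc_per_node=2 ddp_sketch.py
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    model = torch.nn.Linear(16, 4).to(local_rank)  # stand-in model
    model = DDP(model, device_ids=[local_rank])

    # Each process holds its own replica and consumes its own shard of the
    # batch; nothing is scattered/gathered through a single driver process.
    inputs = torch.randn(8, 16, device=local_rank)
    with torch.no_grad():
        outputs = model(inputs)
    print(f"rank {dist.get_rank()}: output shape {tuple(outputs.shape)}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

Because DDP runs one process per device, the single-process replicate/scatter path that triggers the `DataParallel` bug is never exercised.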
@@ -2128,7 +2128,9 @@ class PerceiverBasicDecoder(PerceiverAbstractDecoder):
             # to get the indices for the unflattened array
             # unravel_index returns a tuple (x_idx, y_idx, ...)
             # stack to get the [n, d] tensor of coordinates
-            indices = list(torch.from_numpy(x) for x in np.unravel_index(subsampled_points, self.output_index_dims))
+            indices = list(
+                torch.from_numpy(x) for x in np.unravel_index(subsampled_points.cpu(), self.output_index_dims)
+            )
             pos = torch.stack(indices, dim=1)
             batch_size = inputs.shape[0]
             # Map these coordinates to [-1, 1]
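For context on the `.cpu()` call added above: `np.unravel_index` converts flat indices into per-dimension coordinates on host memory, so a CUDA tensor of indices must be moved to the CPU first. A self-contained illustration (the grid shape `(4, 5)` and the index values are made up for the example):

```python
import numpy as np
import torch

# Flat indices into a hypothetical grid of output_index_dims = (4, 5).
flat = torch.tensor([0, 7, 19])
if torch.cuda.is_available():
    flat = flat.cuda()  # without .cpu() below, np.unravel_index would raise

# unravel_index returns one index array per dimension: (row_idx, col_idx).
rows, cols = np.unravel_index(flat.cpu(), (4, 5))

# Stack into an [n, d] tensor of coordinates, as in PerceiverBasicDecoder.
pos = torch.stack([torch.from_numpy(rows), torch.from_numpy(cols)], dim=1)
print(pos)  # tensor([[0, 0], [1, 2], [3, 4]])
```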
@@ -758,29 +758,11 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
             loss.backward()

     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Perceiver does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
-
-            # some params shouldn't be scattered by nn.DataParallel
-            # so just remove them if they are present.
-            blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
-            for k in blacklist_non_batched_params:
-                inputs_dict.pop(k, None)
-
-            # move input tensors to cuda:0
-            for k, v in inputs_dict.items():
-                if torch.is_tensor(v):
-                    inputs_dict[k] = v.to(0)
-
-            model = model_class(config=config)
-            model.to(0)
-            model.eval()
-
-            # Wrap model in nn.DataParallel
-            model = nn.DataParallel(model)
-            with torch.no_grad():
-                _ = model(**self._prepare_for_class(inputs_dict, model_class))
+        pass

     @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT")
     def test_save_load_fast_init_from_base(self):
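The change above replaces the test body with an explicit `@unittest.skip(reason=...)`. The difference matters for reporting: an empty body counts as a pass, while a skip shows up in the test summary together with its reason. A standalone sketch of the two behaviors (test names are illustrative):

```python
import unittest


class SkipPatternTest(unittest.TestCase):
    def test_silent_pass(self):
        # Reported as a *pass*, hiding the fact that nothing was checked.
        pass

    @unittest.skip(reason="explicitly skipped: documents why this test cannot run")
    def test_explicit_skip(self):
        self.fail("never executed")


if __name__ == "__main__":
    # verbosity=2 prints "ok" for the first test and
    # "skipped 'explicitly skipped: ...'" for the second.
    unittest.main(verbosity=2)
```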
@@ -573,8 +573,10 @@ class ReformerTesterMixin:
         self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs)

     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Reformer does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        # Opt-out of this test.
         pass

     def test_for_sequence_classification(self):
@@ -232,8 +232,10 @@ class TransfoXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
         return

     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Transfo-XL does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        # Opt-out of this test.
         pass

     @slow
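Note that `@require_torch_multi_gpu` stays on each skipped test; it comes from `transformers.testing_utils` and itself skips the test when fewer than two GPUs are available. A rough approximation of such a guard (not the library's actual implementation), built on `unittest.skipUnless`:

```python
import unittest

import torch


def require_torch_multi_gpu(test_case):
    # Skip unless at least two CUDA devices are visible; approximates the
    # behavior of transformers.testing_utils.require_torch_multi_gpu.
    return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)


class MultiGpuTest(unittest.TestCase):
    @require_torch_multi_gpu
    def test_needs_two_gpus(self):
        self.assertGreater(torch.cuda.device_count(), 1)


if __name__ == "__main__":
    unittest.main()
```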