Unverified Commit 68427c9b authored by Nicolas Patry, committed by GitHub

Fixing slow pipeline tests (#14260)

* Fixing slow pipeline tests

* Remove the image-segmentation override.

* Fixing clamping only in training.

* Wav2vec2.

* Remove last mention of `no_grad`.

* Fixing copies.

* Rename.
parent 1a674ce6
@@ -648,6 +648,7 @@ class DetrEncoderLayer(nn.Module):
         hidden_states = residual + hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
 
+        if self.training:
             if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                 clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
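Illustrative aside (not part of the diff): a minimal, self-contained sketch of the guarded-clamp pattern this hunk introduces, using a hypothetical module rather than the real DetrEncoderLayer. The inf/nan check and clamp are an fp16 overflow safeguard, and gating them on self.training keeps the data-dependent check out of the inference path.

import torch
import torch.nn as nn

class ClampedLayer(nn.Module):  # hypothetical module, for illustration only
    def __init__(self, dim: int = 8):
        super().__init__()
        self.fc = nn.Linear(dim, dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc(hidden_states)
        if self.training:
            # Only while training: clamp inf/nan that fp16 overflow can produce.
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
        return hidden_states

layer = ClampedLayer().eval()  # in eval mode the clamp branch is never entered
_ = layer(torch.randn(2, 8))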
@@ -947,7 +947,10 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
         return input_lengths
 
     def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
-        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
         batch_size = attention_mask.shape[0]
 
         attention_mask = torch.zeros(
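Quick standalone check (illustrative only, not model code) that the replacement is equivalent: for a binary attention mask, the last entry of a cumulative sum along the time axis gives the same non-padded lengths as summing directly, so the computed output lengths are unchanged; per the comment in the hunk, the cumsum form was picked so the models can run under inference mode. The same change is copied to UniSpeechSat and Wav2Vec2 below.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=torch.long)

non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]  # tensor([3, 5])
assert torch.equal(non_padded_lengths, attention_mask.sum(-1))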
@@ -948,7 +948,10 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
         return input_lengths
 
     def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
-        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
         batch_size = attention_mask.shape[0]
 
         attention_mask = torch.zeros(
@@ -989,7 +989,10 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
         return input_lengths
 
     def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
-        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
         batch_size = attention_mask.shape[0]
 
         attention_mask = torch.zeros(
@@ -91,9 +91,6 @@ class ImageSegmentationPipeline(Pipeline):
         return super().__call__(*args, **kwargs)
 
-    def get_inference_context(self):
-        return torch.no_grad
-
     def preprocess(self, image):
         image = load_image(image)
         target_size = torch.IntTensor([[image.height, image.width]])
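The override removed above pinned this pipeline to plain torch.no_grad; with the model-side fixes in this commit it can rely on whatever inference context the Pipeline base class supplies. A hedged sketch of that hook pattern follows (a stand-in class, not the actual base-class implementation):

import torch

class TinyPipeline:  # hypothetical stand-in for the Pipeline base class
    def get_inference_context(self):
        # Prefer inference_mode when the installed torch provides it, else no_grad.
        return getattr(torch, "inference_mode", torch.no_grad)

    def forward(self, model, **inputs):
        with self.get_inference_context()():
            return model(**inputs)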
@@ -93,7 +93,6 @@ class TableQuestionAnsweringPipeline(Pipeline):
         )
 
     def batch_inference(self, **inputs):
-        with torch.no_grad():
         return self.model(**inputs)
 
     def sequential_inference(self, **inputs):
@@ -101,7 +100,6 @@ class TableQuestionAnsweringPipeline(Pipeline):
         Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
         handle conversational query related to a table.
         """
-        with torch.no_grad():
         all_logits = []
         all_aggregations = []
         prev_answers = None
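Dropping these with torch.no_grad(): wrappers is behavior-preserving for inference, presumably because the surrounding pipeline machinery already runs the model inside an inference context (cf. the get_inference_context hook above); nesting gradient-disabling contexts is a no-op, as this small standalone check illustrates:

import torch

x = torch.randn(3, requires_grad=True)

with torch.no_grad():
    with torch.no_grad():  # redundant inner context
        inner = (x * 2.0).requires_grad
    outer = (x * 2.0).requires_grad

assert inner is False and outer is False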
@@ -117,7 +117,7 @@ class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
         self.assertEqual(
             nested_simplify(output, decimals=4),
             [
-                {"score": 0.9809, "label": "go"},
+                {"score": 0.981, "label": "go"},
                 {"score": 0.0073, "label": "up"},
                 {"score": 0.0064, "label": "_unknown_"},
                 {"score": 0.0015, "label": "down"},