[style] consistent nn. and nn.functional (#12124)

* consistent nn. and nn.functional * fix glitch * fix glitch #2

[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
1ed2ebf6 · Stas Bekman · GitHub · ff7c8168 · 1ed2ebf6 · 1ed2ebf6
Unverified Commit 1ed2ebf6 authored Jun 14, 2021 by Stas Bekman Committed by GitHub Jun 14, 2021
20 changed files
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -23,7 +23,7 @@ from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch
-import torch.nn as nn
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from ...activations import gelu_new, silu

--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -21,7 +21,6 @@ from typing import Optional, Tuple
 import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss
@@ -239,7 +238,7 @@ class PegasusAttention(nn.Module):
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
@@ -259,7 +258,7 @@ class PegasusAttention(nn.Module):
        else:
            attn_weights_reshaped = None
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)
@@ -321,15 +320,15 @@ class PegasusEncoderLayer(nn.Module):
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        if hidden_states.dtype == torch.float16 and (
@@ -417,7 +416,7 @@ class PegasusDecoderLayer(nn.Module):
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        # Cross-Attention Block
@@ -437,7 +436,7 @@ class PegasusDecoderLayer(nn.Module):
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
-            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            # add cross-attn to positions 3,4 of present_key_value tuple
@@ -447,9 +446,9 @@ class PegasusDecoderLayer(nn.Module):
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        outputs = (hidden_states,)
@@ -629,7 +628,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
    Args:
        config: PegasusConfig
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
    """
    def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
@@ -729,7 +728,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
        hidden_states = inputs_embeds + embed_pos
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # expand attention_mask
        if attention_mask is not None:
@@ -797,7 +796,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
    Args:
        config: PegasusConfig
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
    """
    def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
@@ -969,7 +968,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
        hidden_states = inputs_embeds + positions
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # decoder layers
        all_hidden_states = () if output_hidden_states else None

--- a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
@@ -17,7 +17,7 @@
 import argparse
-import torch
+from torch import nn
 from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging
@@ -107,15 +107,15 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
                param.weight.shape == old_model.in_proj_weight[:embed_dim, :].shape, "Shapes have to match"
                param.bias.shape == old_model.in_proj_bias[:embed_dim].shape, "Shapes have to match"
                if attribute == "query_proj":
-                    model.query_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[:embed_dim, :])
+                    model.query_proj.weight = nn.Parameter(old_model.in_proj_weight[:embed_dim, :])
-                    model.query_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[:embed_dim])
+                    model.query_proj.bias = nn.Parameter(old_model.in_proj_bias[:embed_dim])
                elif attribute == "key_proj":
-                    model.key_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :])
+                    model.key_proj.weight = nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :])
-                    model.key_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim])
+                    model.key_proj.bias = nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim])
                elif attribute == "value_proj":
-                    model.value_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :])
+                    model.value_proj.weight = nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :])
-                    model.value_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[2 * embed_dim :])
+                    model.value_proj.bias = nn.Parameter(old_model.in_proj_bias[2 * embed_dim :])
                is_key_init = True
                break
            elif attribute == "position_embeddings":
@@ -123,7 +123,7 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
                    model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1]
                ), "Hidden size has to match"
                assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
-                model.position_embeddings.weight = torch.nn.Parameter(old_model.embed_positions.weight[:512, :])
+                model.position_embeddings.weight = nn.Parameter(old_model.embed_positions.weight[:512, :])
                is_key_init = True
                break

--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -21,7 +21,6 @@ from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import Tensor, nn
 from torch.nn import LayerNorm
@@ -183,9 +182,9 @@ PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
 def softmax(hidden_state, dim, onnx_trace=False):
    if onnx_trace:
-        return F.softmax(hidden_state.float(), dim=dim)
+        return nn.functional.softmax(hidden_state.float(), dim=dim)
    else:
-        return F.softmax(hidden_state, dim=dim, dtype=torch.float32)
+        return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
 def ngram_attention_bias(sequence_length, ngram, device, dtype):
@@ -732,7 +731,7 @@ class ProphetNetAttention(nn.Module):
        else:
            attn_weights_reshaped = None
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        if layer_head_mask is not None:
            assert layer_head_mask.size() == (
@@ -746,7 +745,7 @@ class ProphetNetAttention(nn.Module):
            # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
            attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped
-        attn_probs = F.dropout(
+        attn_probs = nn.functional.dropout(
            attn_weights,
            p=self.attention_dropout,
            training=self.training,
@@ -767,7 +766,7 @@ class ProphetNetAttention(nn.Module):
        attn_output = self.out_proj(attn_output)
-        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, attn_weights_reshaped, past_key_value
@@ -788,9 +787,9 @@ class ProphetNetFeedForward(nn.Module):
        hidden_states = self.intermediate(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.output(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        return hidden_states
@@ -924,7 +923,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
            )
            main_attn_probs = main_attn_probs.view(batch_size * self.num_attn_heads, -1, sequence_length)
-        main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
+        main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
        # project to attn_output
        main_attn_output = torch.bmm(main_attn_probs, main_value_states)
@@ -989,7 +988,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
                self.ngram, batch_size * self.num_attn_heads, sequence_length, 2 * sequence_length
            )
-        predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training)
+        predict_attn_probs = nn.functional.dropout(
+            predict_attn_probs, p=self.attention_dropout, training=self.training
+        )
        # project to attention output
        # [ngram, B*head, T, c]
        predict_attn_output = torch.einsum("nbts,nbsc->nbtc", (predict_attn_probs, predict_value_states))
@@ -1012,7 +1013,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
            self.ngram, batch_size, self.num_attn_heads, sequence_length, -1
        ).transpose(0, 1)
-        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, main_attn_probs, predict_attn_probs, past_key_value
@@ -1321,7 +1322,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
        hidden_states = inputs_embeds + position_embeddings
        hidden_states = self.embeddings_layer_norm(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training)
        encoder_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
@@ -1538,7 +1539,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
        if self.embeddings_layer_norm:
            hidden_states = self.embeddings_layer_norm(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # init attentions, hidden_states and cache with empty tuples
        all_main_stream_hidden_states = () if output_hidden_states else None
@@ -1995,13 +1996,13 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
                break
            expend_targets[i, :, :] = labels
-        lprobs = F.log_softmax(
+        lprobs = nn.functional.log_softmax(
            logits.view(-1, logits.size(-1)),
            dim=-1,
            dtype=torch.float32,
        )
-        loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+        loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
        if self.config.eps > 0.0:
            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
@@ -2239,13 +2240,13 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
                break
            expend_targets[i, :, :] = labels
-        lprobs = F.log_softmax(
+        lprobs = nn.functional.log_softmax(
            logits.view(-1, logits.size(-1)),
            dim=-1,
            dtype=torch.float32,
        )
-        loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+        loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
        if self.config.eps > 0.0:
            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)

--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -18,6 +18,7 @@ from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple
 import torch
+from torch import nn
 from ...configuration_utils import PretrainedConfig
 from ...file_utils import add_start_docstrings_to_model_forward, replace_return_docstrings
@@ -1065,10 +1066,10 @@ class RagSequenceForGeneration(RagPreTrainedModel):
            return ll.squeeze(-1), smooth_obj.squeeze(-1)
        # seq_logits dim = (batch*n_docs, tgt_len , #vocabs)
-        seq_logprobs = torch.nn.functional.log_softmax(seq_logits, dim=-1).view(
+        seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view(
            seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1)
        )  # batch_size x n_docs x tgt_len x #vocab_size
-        doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
+        doc_logprobs = nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
        # RAG-sequence marginalization
        first_token_scores = seq_logprobs[:, :, :1, :]
@@ -1212,7 +1213,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
        n_docs = n_docs if n_docs is not None else self.config.n_docs
        # RAG-token marginalization
-        seq_logprobs = torch.nn.functional.log_softmax(seq_logits, dim=-1).view(
+        seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view(
            seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1)
        )
        doc_logprobs = torch.log_softmax(doc_scores, dim=1)

--- a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
+++ b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
@@ -20,6 +20,7 @@ import pickle
 import numpy as np
 import torch
+from torch import nn
 from transformers import ReformerConfig, ReformerModelWithLMHead
 from transformers.utils import logging
@@ -31,10 +32,10 @@ logging.set_verbosity_info()
 def set_param(torch_layer, weight, bias=None):
    # set parameter of one layer
    assert torch_layer.weight.shape == weight.shape, f"{torch_layer} layer.weight does not match"
-    torch_layer.weight = torch.nn.Parameter(weight)
+    torch_layer.weight = nn.Parameter(weight)
    if bias is not None:
        assert torch_layer.bias.shape == bias.shape, f"{torch_layer} layer.bias does not match"
-        torch_layer.bias = torch.nn.Parameter(bias)
+        torch_layer.bias = nn.Parameter(bias)
 def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
@@ -153,7 +154,7 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
            assert (
                position_embeddings.weights[emb_idx].shape == emb_weights.shape
            ), f"{position_embeddings[emb_idx]} emb does not match"
-            position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights))
+            position_embeddings.weights[emb_idx] = nn.Parameter(torch.tensor(emb_weights))
    trax_layer_weights = weights[5]
    assert len(torch_model_reformer.encoder.layers) * 4 == len(

--- a/src/transformers/models/reformer/modeling_reformer.py
+++ b/src/transformers/models/reformer/modeling_reformer.py
@@ -1782,7 +1782,7 @@ class ReformerPreTrainedModel(PreTrainedModel):
        """Initialize the weights"""
        if isinstance(module, AxialPositionEmbeddings):
            for weight in module.weights:
-                torch.nn.init.normal_(weight, std=self.config.axial_norm_std)
+                nn.init.normal_(weight, std=self.config.axial_norm_std)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:

--- a/src/transformers/models/retribert/modeling_retribert.py
+++ b/src/transformers/models/retribert/modeling_retribert.py
@@ -20,8 +20,8 @@ RetriBERT model
 import math
 import torch
-import torch.nn as nn
 import torch.utils.checkpoint as checkpoint
+from torch import nn
 from ...file_utils import add_start_docstrings
 from ...modeling_utils import PreTrainedModel

--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -18,8 +18,8 @@
 import math
 import torch
-import torch.nn as nn
 import torch.utils.checkpoint
+from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from ...activations import ACT2FN, gelu

--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -20,7 +20,6 @@ import random
 from typing import Optional, Tuple
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss
@@ -306,7 +305,7 @@ class Speech2TextAttention(nn.Module):
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
@@ -326,7 +325,7 @@ class Speech2TextAttention(nn.Module):
        else:
            attn_weights_reshaped = None
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)
@@ -387,15 +386,15 @@ class Speech2TextEncoderLayer(nn.Module):
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        if hidden_states.dtype == torch.float16 and (
@@ -482,7 +481,7 @@ class Speech2TextDecoderLayer(nn.Module):
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        # Cross-Attention Block
@@ -502,7 +501,7 @@ class Speech2TextDecoderLayer(nn.Module):
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
-            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            # add cross-attn to positions 3,4 of present_key_value tuple
@@ -512,9 +511,9 @@ class Speech2TextDecoderLayer(nn.Module):
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        outputs = (hidden_states,)
@@ -686,7 +685,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
    Args:
        config: Speech2TextConfig
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
    """
    def __init__(self, config: Speech2TextConfig):
@@ -772,7 +771,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
        embed_pos = self.embed_positions(padding_mask)
        hidden_states = inputs_embeds + embed_pos
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # expand attention_mask
        if attention_mask is not None:
@@ -840,7 +839,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
    Args:
        config: Speech2TextConfig
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
    """
    def __init__(self, config: Speech2TextConfig):
@@ -1008,7 +1007,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
        positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
        hidden_states = inputs_embeds + positions
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # decoder layers
        all_hidden_states = () if output_hidden_states else None

--- a/src/transformers/models/squeezebert/modeling_squeezebert.py
+++ b/src/transformers/models/squeezebert/modeling_squeezebert.py
@@ -92,7 +92,7 @@ class SqueezeBertEmbeddings(nn.Module):
        return embeddings
-class MatMulWrapper(torch.nn.Module):
+class MatMulWrapper(nn.Module):
    """
    Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
    torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.

--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -21,7 +21,6 @@ import os
 import warnings
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from torch.utils.checkpoint import checkpoint
@@ -179,7 +178,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 ####################################################
 # PyTorch Models are constructed by sub-classing
 # - torch.nn.Module for the layers and
-# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
+# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
 ####################################################
 PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is a subject to change at a moment's notice.
@@ -257,7 +256,7 @@ class T5DenseReluDense(nn.Module):
    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
-        hidden_states = F.relu(hidden_states)
+        hidden_states = nn.functional.relu(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
@@ -502,10 +501,10 @@ class T5Attention(nn.Module):
                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
        scores += position_bias
-        attn_weights = F.softmax(scores.float(), dim=-1).type_as(
+        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
            scores
        )  # (batch_size, n_heads, seq_length, key_length)
-        attn_weights = F.dropout(
+        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )  # (batch_size, n_heads, seq_length, key_length)

--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -22,8 +22,8 @@ from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch
-import torch.nn as nn
 import torch.utils.checkpoint
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from ...activations import ACT2FN
@@ -2096,10 +2096,8 @@ def _calculate_aggregation_loss_known(
        # Use aggregation supervision as the target.
        target_aggregation = aggregation_labels
-    one_hot_labels = torch.nn.functional.one_hot(target_aggregation, num_classes=num_aggregation_labels).type(
+    one_hot_labels = nn.functional.one_hot(target_aggregation, num_classes=num_aggregation_labels).type(torch.float32)
-        torch.float32
+    log_probs = nn.functional.log_softmax(logits_aggregation, dim=-1)
-    )
-    log_probs = torch.nn.functional.log_softmax(logits_aggregation, dim=-1)
    # torch.FloatTensor[batch_size]
    per_example_aggregation_intermediate = -torch.sum(one_hot_labels * log_probs, dim=-1)
@@ -2243,7 +2241,7 @@ def _calculate_expected_result(
        aggregation_op_only_probs = gumbel_dist.sample()
    else:
        # <float32>[batch_size, num_aggregation_labels - 1]
-        aggregation_op_only_probs = torch.nn.functional.softmax(
+        aggregation_op_only_probs = nn.functional.softmax(
            logits_aggregation[:, 1:] / config.aggregation_temperature, dim=-1
        )

--- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
@@ -21,8 +21,7 @@ from dataclasses import dataclass
 from typing import List, Optional, Tuple
 import torch
-import torch.nn as nn
+from torch import nn
-import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 from ...file_utils import (
@@ -344,7 +343,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score)
        # [qlen x klen x bsz x n_head]
-        attn_prob = F.softmax(attn_score, dim=1)
+        attn_prob = nn.functional.softmax(attn_score, dim=1)
        attn_prob = self.dropatt(attn_prob)
        # Mask heads if we want to
@@ -434,7 +433,7 @@ class AdaptiveEmbedding(nn.Module):
        if self.div_val == 1:
            embed = self.emb_layers[0](inp)
            if self.d_proj != self.d_embed:
-                embed = F.linear(embed, self.emb_projs[0])
+                embed = nn.functional.linear(embed, self.emb_projs[0])
        else:
            param = next(self.parameters())
            inp_flat = inp.view(-1)
@@ -450,7 +449,7 @@ class AdaptiveEmbedding(nn.Module):
                inp_i = inp_flat.index_select(0, indices_i) - l_idx
                emb_i = self.emb_layers[i](inp_i)
-                emb_i = F.linear(emb_i, self.emb_projs[i])
+                emb_i = nn.functional.linear(emb_i, self.emb_projs[i])
                emb_flat.index_copy_(0, indices_i, emb_i)

--- a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
@@ -19,8 +19,7 @@
 import torch
-import torch.nn as nn
+from torch import nn
-import torch.nn.functional as F
 # CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
@@ -71,11 +70,11 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
    def _compute_logit(self, hidden, weight, bias, proj):
        if proj is None:
-            logit = F.linear(hidden, weight, bias=bias)
+            logit = nn.functional.linear(hidden, weight, bias=bias)
        else:
            # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
-            proj_hid = F.linear(hidden, proj.t().contiguous())
+            proj_hid = nn.functional.linear(hidden, proj.t().contiguous())
-            logit = F.linear(proj_hid, weight, bias=bias)
+            logit = nn.functional.linear(proj_hid, weight, bias=bias)
            # else:
            #     logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
            #     if bias is not None:
@@ -110,9 +109,9 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
            if labels is not None:
-                out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
+                out = -nn.functional.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
            else:
-                out = F.log_softmax(logit, dim=-1)
+                out = nn.functional.log_softmax(logit, dim=-1)
        else:
            # construct weights and biases
            weights, biases = [], []
@@ -135,7 +134,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
-            head_logprob = F.log_softmax(head_logit, dim=1)
+            head_logprob = nn.functional.log_softmax(head_logit, dim=1)
            if labels is None:
                out = hidden.new_empty((head_logit.size(0), self.n_token))
@@ -169,7 +168,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
-                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+                    tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
                    if labels is not None:
                        logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(
@@ -205,7 +204,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
        """
        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
-            return F.log_softmax(logit, dim=-1)
+            return nn.functional.log_softmax(logit, dim=-1)
        else:
            # construct weights and biases
            weights, biases = [], []
@@ -229,7 +228,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
            out = hidden.new_empty((head_logit.size(0), self.n_token))
-            head_logprob = F.log_softmax(head_logit, dim=1)
+            head_logprob = nn.functional.log_softmax(head_logit, dim=1)
            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
@@ -241,7 +240,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
-                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+                    tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
                    logprob_i = head_logprob[:, -i] + tail_logprob_i
                    out[:, start_idx, stop_idx] = logprob_i

--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -89,10 +89,10 @@ class VisualBertEmbeddings(nn.Module):
        self.visual_position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        if config.special_visual_initialize:
-            self.visual_token_type_embeddings.weight.data = torch.nn.Parameter(
+            self.visual_token_type_embeddings.weight.data = nn.Parameter(
                self.token_type_embeddings.weight.data.clone(), requires_grad=True
            )
-            self.visual_position_embeddings.weight.data = torch.nn.Parameter(
+            self.visual_position_embeddings.weight.data = nn.Parameter(
                self.position_embeddings.weight.data.clone(), requires_grad=True
            )
@@ -1253,8 +1253,8 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
        loss = None
        if labels is not None:
-            loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
+            loss_fct = nn.KLDivLoss(reduction="batchmean")
-            log_softmax = torch.nn.LogSoftmax(dim=-1)
+            log_softmax = nn.LogSoftmax(dim=-1)
            reshaped_logits = log_softmax(reshaped_logits)
            loss = loss_fct(reshaped_logits, labels.contiguous())
        if not return_dict:

--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -20,7 +20,6 @@ from typing import Optional, Tuple, Union
 import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -449,7 +448,7 @@ class Wav2Vec2Attention(nn.Module):
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
@@ -469,7 +468,7 @@ class Wav2Vec2Attention(nn.Module):
        else:
            attn_weights_reshaped = None
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)
@@ -805,9 +804,9 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
        if self.training:
            # sample code vector probs via gumbel in differentiateable way
-            codevector_probs = F.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True).type_as(
+            codevector_probs = nn.functional.gumbel_softmax(
-                hidden_states
+                hidden_states.float(), tau=self.temperature, hard=True
-            )
+            ).type_as(hidden_states)
            # compute perplexity
            codevector_soft_dist = torch.softmax(
@@ -867,12 +866,12 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
-                        torch.nn.init.kaiming_normal_(module.weight.data)
+                        nn.init.kaiming_normal_(module.weight.data)
                else:
                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
-                        torch.nn.init.kaiming_normal_(module.weight.data)
+                        nn.init.kaiming_normal_(module.weight.data)
            else:
-                torch.nn.init.kaiming_normal_(module.weight.data)
+                nn.init.kaiming_normal_(module.weight.data)
        if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
            module.bias.data.zero_()
@@ -1296,7 +1295,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
            # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
            preds = logits.transpose(0, 2).reshape(-1, logits.size(0))
            target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
-            contrastive_loss = F.cross_entropy(preds.float(), target, reduction="sum")
+            contrastive_loss = nn.functional.cross_entropy(preds.float(), target, reduction="sum")
            # 7. compute diversity loss: \mathbf{L}_d
            num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
@@ -1502,10 +1501,10 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
            flattened_targets = labels.masked_select(labels_mask)
            # ctc_loss doesn't support fp16
-            log_probs = F.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
            with torch.backends.cudnn.flags(enabled=False):
-                loss = F.ctc_loss(
+                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,

--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -503,7 +503,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            # encoder attention (for decoder only)
            # if self.is_decoder and src_enc is not None:
            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            #     tensor = tensor + attn
            #     tensor = self.layer_norm15[i](tensor)

--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -25,7 +25,6 @@ import numpy as np
 import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from torch.nn import functional as F
 from ...activations import gelu
 from ...file_utils import (
@@ -190,8 +189,8 @@ class MultiHeadAttention(nn.Module):
        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, qlen, klen)
-        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
+        weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
-        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+        weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
        # Mask heads if we want to
        if head_mask is not None:
@@ -212,7 +211,7 @@ class TransformerFFN(nn.Module):
        self.dropout = config.dropout
        self.lin1 = nn.Linear(in_dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, out_dim)
-        self.act = gelu if config.gelu_activation else F.relu
+        self.act = gelu if config.gelu_activation else nn.functional.relu
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
@@ -223,7 +222,7 @@ class TransformerFFN(nn.Module):
        x = self.lin1(input)
        x = self.act(x)
        x = self.lin2(x)
-        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        return x
@@ -578,7 +577,7 @@ class XLMModel(XLMPreTrainedModel):
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
-        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
+        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
        # transformer layers
@@ -599,14 +598,14 @@ class XLMModel(XLMPreTrainedModel):
            attn = attn_outputs[0]
            if output_attentions:
                attentions = attentions + (attn_outputs[1],)
-            attn = F.dropout(attn, p=self.dropout, training=self.training)
+            attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)
            # encoder attention (for decoder only)
            # if self.is_decoder and src_enc is not None:
            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            #     tensor = tensor + attn
            #     tensor = self.layer_norm15[i](tensor)
@@ -661,7 +660,9 @@ class XLMPredLayer(nn.Module):
            scores = self.proj(x)
            outputs = (scores,) + outputs
            if y is not None:
-                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean")
+                loss = nn.functional.cross_entropy(
+                    scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean"
+                )
                outputs = (loss,) + outputs
        else:
            scores = self.proj.log_prob(x)

--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -23,7 +23,6 @@ from typing import List, Optional, Tuple
 import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from torch.nn import functional as F
 from ...activations import ACT2FN
 from ...file_utils import (
@@ -305,7 +304,7 @@ class XLNetRelativeAttention(nn.Module):
                attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask)
        # attention probability
-        attn_prob = F.softmax(attn_score, dim=3)
+        attn_prob = nn.functional.softmax(attn_score, dim=3)
        attn_prob = self.dropout(attn_prob)
        # Mask heads if we want to
@@ -1208,7 +1207,7 @@ class XLNetModel(XLNetPreTrainedModel):
            # `1` indicates not in the same segment [qlen x klen x bsz]
            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
-            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
+            seg_mat = nn.functional.one_hot(seg_mat, num_classes=2).to(dtype_float)
        else:
            seg_mat = None
@@ -2034,7 +2033,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
        else:
            # during inference, compute the end logits based on beam search
            bsz, slen, hsz = hidden_states.size()
-            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
+            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)
            start_top_log_probs, start_top_index = torch.topk(
                start_log_probs, self.start_n_top, dim=-1
@@ -2048,7 +2047,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
            )  # shape (bsz, slen, start_n_top, hsz)
            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
+            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
            end_top_log_probs, end_top_index = torch.topk(
                end_log_probs, self.end_n_top, dim=1