"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "d1ec36b94f5ba45fb2423e74074cfedab48cfe73"
Unverified commit a156da9a, authored by Stas Bekman and committed by GitHub

consistent nn. and nn.functional: p2 templates (#12153)

parent 007be9e4
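
The diff below swaps the `import torch.nn.functional as F` alias for fully qualified `nn.functional.*` calls in the model templates, so a single `from torch import nn` import covers both the module classes and the functional ops. As a rough illustration of the convention being enforced (a minimal sketch, not code from the templates; `TinyBlock` and its sizes are made up for the example):

```python
import torch
from torch import nn  # the only torch.nn import the templates keep


class TinyBlock(nn.Module):
    """Hypothetical module, only to show the nn.functional call style."""

    def __init__(self, hidden_size: int = 16, dropout: float = 0.1):
        super().__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.dropout = dropout

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc(hidden_states)
        # Formerly written as F.softmax / F.dropout under `import torch.nn.functional as F`;
        # the templates now spell out nn.functional explicitly.
        hidden_states = nn.functional.softmax(hidden_states, dim=-1)
        return nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)


if __name__ == "__main__":
    out = TinyBlock()(torch.randn(2, 16))
    print(out.shape)  # torch.Size([2, 16])
```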
@@ -711,7 +711,7 @@ defined by the name of the class attribute you give the layer. Let's
 define a dummy model in PyTorch, called `SimpleModel` as follows:
 ```python
-import torch.nn as nn
+from torch import nn
 class SimpleModel(nn.Module):
     def __init__(self):
...
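
The documentation hunk above is truncated after `def __init__(self):`; for reference, a plausible completion of the `SimpleModel` snippet under the new import style might look like the following (the layer names and sizes are illustrative, not necessarily the guide's exact code):

```python
from torch import nn


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Illustrative layers; the guide's actual SimpleModel may differ.
        self.dense = nn.Linear(10, 10)
        self.intermediate = nn.Linear(10, 10)
        self.layer_norm = nn.LayerNorm(10)
```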
@@ -1542,7 +1542,6 @@ import random
 from typing import Optional, Tuple
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss
@@ -1743,7 +1742,7 @@ class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
@@ -1763,7 +1762,7 @@ class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
         else:
             attn_weights_reshaped = None
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
         attn_output = torch.bmm(attn_probs, value_states)
@@ -1823,15 +1822,15 @@ class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
         residual = hidden_states
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
@@ -1916,7 +1915,7 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -1936,7 +1935,7 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
                 past_key_value=cross_attn_past_key_value,
                 output_attentions=output_attentions,
             )
-            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
             hidden_states = residual + hidden_states
             hidden_states = self.encoder_attn_layer_norm(hidden_states)
@@ -1946,9 +1945,9 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
         # Fully Connected
         residual = hidden_states
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
@@ -2171,7 +2170,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
     Args:
         config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
     """
     def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
@@ -2270,7 +2269,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         # expand attention_mask
         if attention_mask is not None:
@@ -2337,7 +2336,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
     Args:
         config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
     """
     def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
@@ -2506,7 +2505,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
...
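
Most of the hunks above touch dropout call sites. One detail worth keeping in mind when reading them: unlike an `nn.Dropout` submodule, the functional form does not track the module's train/eval mode on its own, which is why every call passes `training=self.training` (the functional default is `training=True`). A small standalone check, assuming nothing beyond stock PyTorch; `DropoutDemo` is a made-up name for illustration:

```python
import torch
from torch import nn


class DropoutDemo(nn.Module):
    """Hypothetical module, only to show why training=self.training matters."""

    def __init__(self, p: float = 0.5):
        super().__init__()
        self.p = p

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Without training=self.training, functional dropout would stay active
        # even after model.eval(), unlike an nn.Dropout submodule.
        return nn.functional.dropout(x, p=self.p, training=self.training)


demo = DropoutDemo()
x = torch.ones(4)

demo.eval()
print(demo(x))  # tensor([1., 1., 1., 1.]) -- dropout disabled in eval mode

demo.train()
print(demo(x))  # some entries zeroed, survivors scaled by 1 / (1 - p)
```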
@@ -725,7 +725,7 @@ defined by the name of the class attribute you give the layer. Let's
 define a dummy model in PyTorch, called `SimpleModel` as follows:
 ```python
-import torch.nn as nn
+from torch import nn
 class SimpleModel(nn.Module):
     def __init__(self):
...