"tests/backend/vscode:/vscode.git/clone" did not exist on "071cba1f2bc9f856040ccebfde706ec9b2cd4434"
Commit f492db25 authored by Kartikay Khandelwal, committed by Facebook GitHub Bot

Refactor Fairseq models for BERT and XLM to use TransformerSentenceEncoder (#622)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/622

Updating some defaults to more meaningful values

Reviewed By: rutyrinott

Differential Revision: D14761263

fbshipit-source-id: 7ac670aa370f315ddfb511c63273583a6062c569
parent f040158a
@@ -37,8 +37,8 @@ class TransformerSentenceEncoderLayer(nn.Module):
         dropout: float = 0.1,
         attention_dropout: float = 0.1,
         activation_dropout: float = 0.1,
-        encoder_normalize_before: bool = True,
-        use_bert_layer_norm: bool = True,
+        encoder_normalize_before: bool = False,
+        use_bert_layer_norm: bool = False,
         use_gelu: bool = True,
     ) -> None:
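For context, `encoder_normalize_before` selects between pre-norm and post-norm residual blocks through the `_maybe_layer_norm` call sites visible in the second hunk below. The following is a minimal, self-contained sketch of that switch, written to mirror fairseq's `maybe_layer_norm` pattern; the standalone function and the example tensors are illustrative, not the library's API.

    # Illustrative sketch of the pre-/post-norm switch controlled by
    # encoder_normalize_before; not fairseq's actual code.
    import torch
    import torch.nn as nn

    def maybe_layer_norm(layer_norm: nn.LayerNorm,
                         x: torch.Tensor,
                         normalize_before: bool,
                         before: bool = False,
                         after: bool = False) -> torch.Tensor:
        """Apply layer norm either before or after a sublayer.

        Exactly one of `before`/`after` must be set; which call site
        actually normalizes depends on `normalize_before`.
        """
        assert before ^ after
        # Pre-norm: normalize at the `before` call site.
        # Post-norm: normalize at the `after` call site.
        if after ^ normalize_before:
            return layer_norm(x)
        return x

    ln = nn.LayerNorm(8)
    x = torch.randn(2, 8)
    # With the new default (normalize_before=False, i.e. post-norm):
    y = maybe_layer_norm(ln, x, normalize_before=False, before=True)  # no-op
    z = maybe_layer_norm(ln, x, normalize_before=False, after=True)   # applies LN

With the new default of `False` (post-norm, matching the original BERT), normalization runs after each residual addition rather than before the sublayer.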
@@ -108,6 +108,7 @@ class TransformerSentenceEncoderLayer(nn.Module):
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self._maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
+        residual = x
         x = self._maybe_layer_norm(self.final_layer_norm, x, before=True)
         x = self.activation_fn(self.fc1(x))
...
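The one added line in this hunk, `residual = x`, fixes the feed-forward sublayer's skip connection: before this change, `residual` still held the input to the self-attention block, so the FFN's residual path bypassed attention entirely. Below is a minimal post-norm layer sketch showing the corrected pattern; the module names and sizes are hypothetical, and only the residual handling mirrors the diff.

    # Illustrative sketch of why the added `residual = x` matters.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TinyEncoderLayer(nn.Module):
        def __init__(self, dim=8, ffn_dim=16, heads=2, dropout=0.1):
            super().__init__()
            self.self_attn = nn.MultiheadAttention(dim, heads, dropout=dropout)
            self.self_attn_layer_norm = nn.LayerNorm(dim)
            self.fc1 = nn.Linear(dim, ffn_dim)
            self.fc2 = nn.Linear(ffn_dim, dim)
            self.final_layer_norm = nn.LayerNorm(dim)
            self.dropout = dropout

        def forward(self, x):
            # Self-attention sublayer (post-norm).
            residual = x
            x, _ = self.self_attn(x, x, x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.self_attn_layer_norm(residual + x)

            # Without the fix, `residual` would still hold the input to the
            # attention block, so the FFN skip connection would bypass
            # attention. Resetting it here makes the FFN residual start
            # from the attention output, as intended.
            residual = x
            x = F.gelu(self.fc1(x))
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.fc2(x)
            x = self.final_layer_norm(residual + x)
            return x

    layer = TinyEncoderLayer()
    out = layer(torch.randn(5, 3, 8))  # (seq_len, batch, dim)
    print(out.shape)                   # torch.Size([5, 3, 8])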