Commit f492db25 authored by Kartikay Khandelwal, committed by Facebook Github Bot

Refactor Fairseq models for BERT and XLM to use TransformerSentenceEncoder (#622)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/622

Updating some defaults to more meaningful values

Reviewed By: rutyrinott

Differential Revision: D14761263

fbshipit-source-id: 7ac670aa370f315ddfb511c63273583a6062c569
parent f040158a
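In practice, the refactor points the BERT- and XLM-style models at the shared TransformerSentenceEncoder stack instead of keeping per-model encoder code. The sketch below shows roughly what constructing that shared encoder looks like; it is illustrative only: the structural arguments (vocab_size, padding_idx, layer count, dimensions) and the forward interface are assumptions based on the module's later public signature, while the three boolean flags mirror the layer parameters changed in the diff that follows.

import torch
from fairseq.modules import TransformerSentenceEncoder

# Illustrative sketch; argument names other than the three flags are assumed
# from the module's later interface and may differ at this revision.
encoder = TransformerSentenceEncoder(
    padding_idx=1,
    vocab_size=32000,
    num_encoder_layers=12,
    embedding_dim=768,
    ffn_embedding_dim=3072,
    num_attention_heads=12,
    dropout=0.1,
    attention_dropout=0.1,
    activation_dropout=0.1,
    encoder_normalize_before=False,  # post-norm blocks (the new layer default)
    use_bert_layer_norm=True,        # BERT-style LayerNorm, opted in by the model
    use_gelu=True,                   # GELU activation
)

tokens = torch.randint(2, 32000, (4, 128))   # dummy (batch, seq_len) token ids
segments = torch.zeros_like(tokens)          # single-segment input
# Assumed forward interface: per-layer hidden states plus a pooled sentence
# representation taken from the first position.
inner_states, sentence_rep = encoder(tokens, segments)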
@@ -37,8 +37,8 @@ class TransformerSentenceEncoderLayer(nn.Module):
         dropout: float = 0.1,
         attention_dropout: float = 0.1,
         activation_dropout: float = 0.1,
-        encoder_normalize_before: bool = True,
-        use_bert_layer_norm: bool = True,
+        encoder_normalize_before: bool = False,
+        use_bert_layer_norm: bool = False,
         use_gelu: bool = True,
     ) -> None:
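The two flipped defaults mean a bare layer is now post-norm and uses the standard (non-BERT) LayerNorm path; the BERT-specific choices have to be requested explicitly by the model that builds the layer. A rough usage sketch, assuming the class is exported from fairseq.modules at this revision and that its remaining constructor arguments (embedding dim, FFN dim, attention heads) keep their defaults:

from fairseq.modules import TransformerSentenceEncoderLayer

# With the new defaults this builds a post-norm block with the standard
# LayerNorm implementation and GELU activation.
layer = TransformerSentenceEncoderLayer()

# The previous defaults (pre-norm, BERT-style LayerNorm) now have to be
# selected explicitly by whichever model wants them.
pre_norm_layer = TransformerSentenceEncoderLayer(
    encoder_normalize_before=True,
    use_bert_layer_norm=True,
)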
@@ -108,6 +108,7 @@ class TransformerSentenceEncoderLayer(nn.Module):
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self._maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
         residual = x
         x = self._maybe_layer_norm(self.final_layer_norm, x, before=True)
         x = self.activation_fn(self.fc1(x))
...
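The second hunk is the usual residual-plus-conditional-norm pattern: with post-norm (the new default) the LayerNorm runs after each residual sum, while pre-norm runs it on the sublayer input instead. Below is a self-contained sketch of that switch in plain PyTorch, written for illustration rather than copied from fairseq; only the _maybe_layer_norm before/after logic is meant to match the hunk above.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyEncoderLayer(nn.Module):
    """Minimal illustration of the before/after layer-norm switch."""

    def __init__(self, dim=16, ffn_dim=32, normalize_before=False, dropout=0.1):
        super().__init__()
        self.normalize_before = normalize_before
        self.dropout = dropout
        self.self_attn = nn.MultiheadAttention(dim, num_heads=4)
        self.self_attn_layer_norm = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(dim, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, dim)
        self.final_layer_norm = nn.LayerNorm(dim)

    def _maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        # Pre-norm layers normalize the sublayer input; post-norm layers
        # normalize the residual sum. Exactly one side is requested per call.
        assert before ^ after
        if after ^ self.normalize_before:
            return layer_norm(x)
        return x

    def forward(self, x):
        # Self-attention block.
        residual = x
        x = self._maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        x, _ = self.self_attn(x, x, x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self._maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        # Feed-forward block, mirroring the hunk above.
        residual = x
        x = self._maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.gelu(self.fc1(x))
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self._maybe_layer_norm(self.final_layer_norm, x, after=True)
        return x

x = torch.randn(10, 2, 16)                             # (seq_len, batch, dim)
post_norm = ToyEncoderLayer(normalize_before=False)    # new default behavior
pre_norm = ToyEncoderLayer(normalize_before=True)      # previous default
print(post_norm(x).shape, pre_norm(x).shape)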