Unverified commit 416711c3, authored by Hovnatan Karapetyan, committed by GitHub

Fix 29807 sinusoidal positional encodings in Flaubert, Informer and XLM (#29904)

* Fix sinusoidal_embeddings in FlaubertModel

* Fix for Informer

* Fix for XLM

* Move sinusoidal emb for XLM

* Move sinusoidal emb for Flaubert

* Small cleanup

* Add comments on tests code copied from

* Add with Distilbert->
parent 83b26dd7
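The underlying bug, as the diffs below show, is an ordering problem: the sinusoidal tables were written into `position_embeddings.weight` inside `__init__`, but `post_init()` then runs `_init_weights` over every submodule and overwrites them with normally distributed values. The fix makes `_init_weights` itself responsible for the sinusoidal fill, so it is the last writer. A minimal sketch of the failure mode, using a toy module rather than the real `PreTrainedModel` machinery:

```python
import torch
import torch.nn as nn


class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.pos = nn.Embedding(4, 2)
        # Buggy pattern: "special" weights written in __init__ ...
        with torch.no_grad():
            self.pos.weight.zero_()
        # ... are clobbered when init runs afterwards, as post_init() does.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)


print(Toy().pos.weight)  # not zeros: the random init ran last and won
```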
src/transformers/models/flaubert/modeling_flaubert.py

@@ -58,10 +58,10 @@ from ..deprecated._archive_maps import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST  # noqa
 # Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings
 def create_sinusoidal_embeddings(n_pos, dim, out):
     position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
+    out.requires_grad = False
     out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
     out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
     out.detach_()
-    out.requires_grad = False


 # Copied from transformers.models.xlm.modeling_xlm.get_masks
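A second, subtler part of the fix is the reordering inside `create_sinusoidal_embeddings`: PyTorch rejects in-place writes to (a view of) a leaf tensor that still requires grad, so `out.requires_grad = False` must run before the slice assignments rather than after, which appears to be exactly what the old order tripped over. A quick demonstration against a plain `nn.Embedding` weight:

```python
import torch
from torch import nn

emb = nn.Embedding(6, 4)

try:
    # Fails: the weight is a leaf Parameter that still requires grad.
    emb.weight[:, 0::2] = 0.0
except RuntimeError as err:
    print(err)  # "... a view of a leaf Variable that requires grad ..."

# Disabling grad first makes the in-place fill legal, matching the new order.
emb.weight.requires_grad = False
emb.weight[:, 0::2] = 0.0
```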
@@ -370,6 +370,10 @@ class FlaubertPreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
+        if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(
+                self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
+            )


 class FlaubertModel(FlaubertPreTrainedModel):
@@ -407,8 +411,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
         # embeddings
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
-        if config.sinusoidal_embeddings:
-            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1 and config.use_lang_emb:
             self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
         self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
src/transformers/models/informer/modeling_informer.py

@@ -890,7 +890,7 @@ class InformerPreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
+        elif isinstance(module, nn.Embedding) and not isinstance(module, InformerSinusoidalPositionalEmbedding):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
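Informer takes a different route because its positional module is an `nn.Embedding` subclass whose weight is computed deterministically at construction time, so `_init_weights` must leave it alone. The shape of the pattern, with hypothetical class names standing in for the real ones:

```python
import torch
from torch import nn


class SinusoidalPositionalEmbedding(nn.Embedding):
    """An embedding whose weight is a fixed sinusoidal table, not learned."""

    def __init__(self, num_positions: int, embedding_dim: int):
        super().__init__(num_positions, embedding_dim)
        # The deterministic sin/cos fill would happen here.


def init_weights(module: nn.Module) -> None:
    # Randomize plain embeddings only; normal_() would destroy the
    # deterministic table, so the subclass is explicitly exempted.
    if isinstance(module, nn.Embedding) and not isinstance(module, SinusoidalPositionalEmbedding):
        module.weight.data.normal_(mean=0.0, std=0.02)
```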
src/transformers/models/xlm/modeling_xlm.py

@@ -59,10 +59,10 @@ from ..deprecated._archive_maps import XLM_PRETRAINED_MODEL_ARCHIVE_LIST  # noqa
 def create_sinusoidal_embeddings(n_pos, dim, out):
     position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
+    out.requires_grad = False
     out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
     out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
     out.detach_()
-    out.requires_grad = False


 def get_masks(slen, lengths, causal, padding_mask=None):
@@ -245,6 +245,10 @@ class XLMPreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
+        if isinstance(module, XLMModel) and self.config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(
+                self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
+            )


 @dataclass
@@ -414,8 +418,6 @@ class XLMModel(XLMPreTrainedModel):
         # embeddings
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
-        if config.sinusoidal_embeddings:
-            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1 and config.use_lang_emb:
             self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
         self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
tests/models/flaubert/test_modeling_flaubert.py

@@ -36,6 +36,7 @@ if is_torch_available():
         FlaubertModel,
         FlaubertWithLMHeadModel,
     )
+    from transformers.models.flaubert.modeling_flaubert import create_sinusoidal_embeddings


 class FlaubertModelTester(object):
@@ -431,6 +432,14 @@ class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_flaubert_model(*config_and_inputs)

+    # Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->Flaubert
+    def test_flaubert_model_with_sinusoidal_encodings(self):
+        config = FlaubertConfig(sinusoidal_embeddings=True)
+        model = FlaubertModel(config=config)
+        sinusoidal_pos_embds = torch.empty((config.max_position_embeddings, config.emb_dim), dtype=torch.float32)
+        create_sinusoidal_embeddings(config.max_position_embeddings, config.emb_dim, sinusoidal_pos_embds)
+        self.model_tester.parent.assertTrue(torch.equal(model.position_embeddings.weight, sinusoidal_pos_embds))
+
     def test_flaubert_lm_head(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs)
tests/models/informer/test_modeling_informer.py

@@ -35,7 +35,11 @@ if is_torch_available():
     import torch

     from transformers import InformerConfig, InformerForPrediction, InformerModel
-    from transformers.models.informer.modeling_informer import InformerDecoder, InformerEncoder
+    from transformers.models.informer.modeling_informer import (
+        InformerDecoder,
+        InformerEncoder,
+        InformerSinusoidalPositionalEmbedding,
+    )


 @require_torch
@@ -164,6 +168,12 @@ class InformerModelTester:
         self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)

+        embed_positions = InformerSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
+        self.parent.assertTrue(torch.equal(model.encoder.embed_positions.weight, embed_positions.weight))
+        self.parent.assertTrue(torch.equal(model.decoder.embed_positions.weight, embed_positions.weight))
+
         with tempfile.TemporaryDirectory() as tmpdirname:
             decoder = model.get_decoder()
             decoder.save_pretrained(tmpdirname)
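These assertions rely on the sinusoidal table depending only on the constructor arguments, so a freshly built embedding must match the weights already sitting inside the model. A standalone sketch of the same check (assuming, as the test suggests, a `(num_positions, embedding_dim)` constructor that fills the weight deterministically):

```python
import torch
from transformers.models.informer.modeling_informer import (
    InformerSinusoidalPositionalEmbedding,
)

# Deterministic construction: same arguments, bit-identical weights.
a = InformerSinusoidalPositionalEmbedding(32, 16)
b = InformerSinusoidalPositionalEmbedding(32, 16)
assert torch.equal(a.weight, b.weight)
```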
tests/models/xlm/test_modeling_xlm.py

@@ -36,6 +36,7 @@ if is_torch_available():
         XLMModel,
         XLMWithLMHeadModel,
     )
+    from transformers.models.xlm.modeling_xlm import create_sinusoidal_embeddings


 class XLMModelTester:
@@ -432,6 +433,14 @@ class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_model(*config_and_inputs)

+    # Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->XLM
+    def test_xlm_model_with_sinusoidal_encodings(self):
+        config = XLMConfig(sinusoidal_embeddings=True)
+        model = XLMModel(config=config)
+        sinusoidal_pos_embds = torch.empty((config.max_position_embeddings, config.emb_dim), dtype=torch.float32)
+        create_sinusoidal_embeddings(config.max_position_embeddings, config.emb_dim, sinusoidal_pos_embds)
+        self.model_tester.parent.assertTrue(torch.equal(model.position_embeddings.weight, sinusoidal_pos_embds))
+
     def test_xlm_lm_head(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)