"git@developer.sourcefind.cn:yangrong/internvl2_pytorch.git" did not exist on "045dc06ea405e713ff3c96ff6d62bfb61d844f74"
Commit 57053334 authored by thomwolf

add initialization for everybody

parent f2a337b3
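This commit threads an explicit weight initializer through every TF 2.0 model instead of scattering ad-hoc tf.random_normal_initializer calls. The get_initializer helper that the diffs below import from modeling_tf_utils is not shown in any hunk; as a sketch (an assumption inferred from how the kwargs are used, not the verbatim source), it presumably wraps a truncated normal with the configured stddev:

import tensorflow as tf

def get_initializer(initializer_range=0.02):
    # Sketch of the helper imported by the diffs below: a truncated-normal
    # initializer whose stddev is the configured initializer_range.
    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
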
The TF 2.0 quickstart example as it stands at this commit:

import tensorflow as tf
import tensorflow_datasets
from pytorch_transformers import *

# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

# Prepare training: compile the tf.keras model with optimizer, loss and metric
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
>>> Train for 115 steps, validate for 7 steps
>>> Epoch 1/2
>>> 115/115 [==============================] - 53s 459ms/step - loss: 0.6033 - accuracy: 0.6712 - val_loss: 0.4964 - val_accuracy: 0.7647
>>> Epoch 2/2
>>> 115/115 [==============================] - 33s 289ms/step - loss: 0.4141 - accuracy: 0.8160 - val_loss: 0.3914 - val_accuracy: 0.8382

# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

# Quickly test a few predictions - MRPC is a paraphrasing task; let's see if our model learned it
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
>>> sentence_1 is a paraphrase of sentence_0
>>> sentence_2 is not a paraphrase of sentence_0
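As an aside, a cheap way to confirm that the TF-to-PyTorch conversion above preserved the weights is to compare logits from both models on the same pair. A minimal sketch, assuming encode_plus accepts return_tensors='tf' on this branch and that the TF model accepts a dict of tensors:

import numpy as np

tf_inputs = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='tf')
pt_inputs = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
tf_logits = model(tf_inputs)[0].numpy()                     # TF model runs in inference mode by default
pt_logits = pytorch_model(**pt_inputs)[0].detach().numpy()  # converted PyTorch copy
print(np.allclose(tf_logits, pt_logits, atol=1e-4))         # should print True if the conversion is faithful
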
pytorch_transformers/modeling_tf_distilbert.py

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf
 from .configuration_distilbert import DistilBertConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -79,8 +79,15 @@ class TFEmbeddings(tf.keras.layers.Layer):
         super(TFEmbeddings, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dim = config.dim
-        self.word_embeddings = TFSharedEmbeddings(config.vocab_size, config.dim, name='word_embeddings')  # padding_idx=0)
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, config.dim, name='position_embeddings')
+        self.initializer_range = config.initializer_range
+        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
+                                                  config.dim,
+                                                  initializer_range=config.initializer_range,
+                                                  name='word_embeddings')  # padding_idx=0)
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             config.dim,
+                                                             embeddings_initializer=get_initializer(config.initializer_range),
+                                                             name='position_embeddings')
         if config.sinusoidal_pos_embds:
             raise NotImplementedError
@@ -95,8 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
         self.word_embeddings = self.add_weight(
             "weight",
             shape=[self.vocab_size, self.dim],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=self.dim**-0.5))
+            initializer=get_initializer(self.initializer_range))
         super(TFEmbeddings, self).build(input_shape)

     def call(self, inputs, mode="embedding", training=False):
@@ -178,10 +184,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
         assert self.dim % self.n_heads == 0
-        self.q_lin = tf.keras.layers.Dense(config.dim, name="q_lin")
-        self.k_lin = tf.keras.layers.Dense(config.dim, name="k_lin")
-        self.v_lin = tf.keras.layers.Dense(config.dim, name="v_lin")
-        self.out_lin = tf.keras.layers.Dense(config.dim, name="out_lin")
+        self.q_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="q_lin")
+        self.k_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="k_lin")
+        self.v_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="v_lin")
+        self.out_lin = tf.keras.layers.Dense(config.dim,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name="out_lin")
         self.pruned_heads = set()
@@ -254,8 +268,12 @@ class TFFFN(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFFFN, self).__init__(**kwargs)
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.lin1 = tf.keras.layers.Dense(config.hidden_dim, name="lin1")
-        self.lin2 = tf.keras.layers.Dense(config.dim, name="lin2")
+        self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin1")
+        self.lin2 = tf.keras.layers.Dense(config.dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin2")
         assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
         self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
@@ -596,7 +614,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         self.vocab_size = config.vocab_size
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.vocab_transform = tf.keras.layers.Dense(config.dim, name="vocab_transform")
+        self.vocab_transform = tf.keras.layers.Dense(config.dim,
+                                                     kernel_initializer=get_initializer(config.initializer_range),
+                                                     name="vocab_transform")
         self.act = tf.keras.layers.Activation(gelu)
         self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
         self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
@@ -647,8 +667,13 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
         self.num_labels = config.num_labels
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.pre_classifier = tf.keras.layers.Dense(config.dim, activation='relu', name="pre_classifier")
-        self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier")
+        self.pre_classifier = tf.keras.layers.Dense(config.dim,
+                                                    kernel_initializer=get_initializer(config.initializer_range),
+                                                    activation='relu',
+                                                    name="pre_classifier")
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name="classifier")
         self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

     def call(self, inputs, **kwargs):
@@ -700,7 +725,9 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
         super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
         assert config.num_labels == 2
         self.dropout = tf.keras.layers.Dropout(config.qa_dropout)

pytorch_transformers/modeling_tf_gpt2.py

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf
 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list)
+                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -76,8 +76,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale
-        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()
@@ -166,8 +166,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
@@ -212,8 +212,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
-        self.wte = TFSharedEmbeddings(config.vocab_size, config.hidden_size, name='wte')
-        self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
+        self.wte = TFSharedEmbeddings(config.vocab_size,
+                                      config.hidden_size,
+                                      initializer_range=config.initializer_range,
+                                      name='wte')
+        self.wpe = tf.keras.layers.Embedding(config.n_positions,
+                                             config.n_embd,
+                                             embeddings_initializer=get_initializer(config.initializer_range),
+                                             name='wpe')
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
         self.h = [TFBlock(config.n_ctx,
                           config,
@@ -557,7 +563,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFGPT2MainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')
+        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

     def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
         if isinstance(inputs, (tuple, list)):

pytorch_transformers/modeling_tf_openai.py

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf
 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list)
+                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -83,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale
-        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()
@@ -168,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
@@ -212,8 +212,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
-        self.tokens_embed = TFSharedEmbeddings(config.vocab_size, config.n_embd, name='tokens_embed')
-        self.positions_embed = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='positions_embed')
+        self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
+                                               config.n_embd,
+                                               initializer_range=config.initializer_range,
+                                               name='tokens_embed')
+        self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
+                                                         config.n_embd,
+                                                         embeddings_initializer=get_initializer(config.initializer_range),
+                                                         name='positions_embed')
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
         self.h = [TFBlock(config.n_ctx,
                           config,
@@ -522,7 +528,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')
+        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

     def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
         if isinstance(inputs, (tuple, list)):

pytorch_transformers/modeling_tf_roberta.py

@@ -24,7 +24,7 @@ import numpy as np
 import tensorflow as tf
 from .configuration_roberta import RobertaConfig
-from .modeling_tf_utils import TFPreTrainedModel
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -232,7 +232,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFRobertaLMHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
-        self.dense = tf.keras.layers.Dense(config.hidden_size, name='dense')
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
         self.act = tf.keras.layers.Activation(gelu)
@@ -315,9 +317,14 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size, activation='tanh', name="dense")
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           activation='tanh',
+                                           name="dense")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.out_proj = tf.keras.layers.Dense(config.num_labels, name="out_proj")
+        self.out_proj = tf.keras.layers.Dense(config.num_labels,
+                                              kernel_initializer=get_initializer(config.initializer_range),
+                                              name="out_proj")

     def call(self, features, training=False):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])

pytorch_transformers/modeling_tf_transfo_xl.py

@@ -30,7 +30,7 @@ import numpy as np
 import tensorflow as tf
 from .configuration_transfo_xl import TransfoXLConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -66,16 +66,21 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
 class TFPositionwiseFF(tf.keras.layers.Layer):
-    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, **kwargs):
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
         super(TFPositionwiseFF, self).__init__(**kwargs)
         self.d_model = d_model
         self.d_inner = d_inner
         self.dropout = dropout
-        self.layer_1 = tf.keras.layers.Dense(d_inner, activation=tf.nn.relu, name='CoreNet_._0')
+        self.layer_1 = tf.keras.layers.Dense(d_inner,
+                                             kernel_initializer=get_initializer(init_std),
+                                             activation=tf.nn.relu,
+                                             name='CoreNet_._0')
         self.drop_1 = tf.keras.layers.Dropout(dropout)
-        self.layer_2 = tf.keras.layers.Dense(d_model, name='CoreNet_._3')
+        self.layer_2 = tf.keras.layers.Dense(d_model,
+                                             kernel_initializer=get_initializer(init_std),
+                                             name='CoreNet_._3')
         self.drop_2 = tf.keras.layers.Dropout(dropout)
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
@@ -110,7 +115,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
                  tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
                  r_r_bias=None, r_w_bias=None, output_attentions=False,
-                 layer_norm_epsilon=1e-5, **kwargs):
+                 layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
         super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
         self.output_attentions = output_attentions
@@ -119,11 +124,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         self.d_head = d_head
         self.dropout = dropout
-        self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, use_bias=False, name='qkv_net')
+        self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head,
+                                             kernel_initializer=get_initializer(init_std),
+                                             use_bias=False,
+                                             name='qkv_net')
         self.drop = tf.keras.layers.Dropout(dropout)
         self.dropatt = tf.keras.layers.Dropout(dropatt)
-        self.o_net = tf.keras.layers.Dense(d_model, use_bias=False, name='o_net')
+        self.o_net = tf.keras.layers.Dense(d_model,
+                                           kernel_initializer=get_initializer(init_std),
+                                           use_bias=False,
+                                           name='o_net')
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
@@ -138,14 +149,19 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
             self.r_r_bias = None
             self.r_w_bias = None
-        self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, use_bias=False, name='r_net')
+        self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head,
+                                           kernel_initializer=get_initializer(init_std),
+                                           use_bias=False,
+                                           name='r_net')

     def build(self, input_shape):
         if self.r_r_bias is None or self.r_w_bias is None:  # Biases are not shared
             self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
+                                            initializer='zeros',
                                             trainable=True,
                                             name='r_r_bias')
             self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
+                                            initializer='zeros',
                                             trainable=True,
                                             name='r_w_bias')
         super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
@@ -249,17 +265,18 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
                  r_r_bias=None,
                  output_attentions=False,
                  layer_norm_epsilon=1e-5,
+                 init_std=0.02,
                  **kwargs):
         super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
         self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model,
                          d_head, dropout, tgt_len=tgt_len, ext_len=ext_len,
                          mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm,
-                         r_w_bias=r_w_bias, r_r_bias=r_r_bias,
+                         r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std,
                          output_attentions=output_attentions,
                          layer_norm_epsilon=layer_norm_epsilon, name='dec_attn')
         self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout,
-                                       pre_lnorm=pre_lnorm,
+                                       pre_lnorm=pre_lnorm, init_std=init_std,
                                        layer_norm_epsilon=layer_norm_epsilon,
                                        name='pos_ff')
@@ -275,12 +292,13 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
 class TFAdaptiveEmbedding(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
                  sample_softmax=False, **kwargs):
         super(TFAdaptiveEmbedding, self).__init__(**kwargs)
         self.n_token = n_token
         self.d_embed = d_embed
+        self.init_std = init_std
         self.cutoffs = cutoffs + [n_token]
         self.div_val = div_val
@@ -298,12 +316,16 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
             for i in range(len(self.cutoffs)):
                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
                 d_emb_i = d_embed // (div_val ** i)
-                self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, d_emb_i, name='emb_layers_._{}'.format(i)))
+                self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx,
+                                                                 d_emb_i,
+                                                                 embeddings_initializer=get_initializer(init_std),
+                                                                 name='emb_layers_._{}'.format(i)))

     def build(self, input_shape):
         for i in range(len(self.cutoffs)):
             d_emb_i = self.d_embed // (self.div_val ** i)
             self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj),
+                                                  initializer=get_initializer(self.init_std),
                                                   trainable=True,
                                                   name='emb_projs_._{}'.format(i)))
         super(TFAdaptiveEmbedding, self).build(input_shape)
@@ -349,7 +371,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.untie_r = config.untie_r
         self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
-                                            div_val=config.div_val, name='word_emb')
+                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')
         self.drop = tf.keras.layers.Dropout(config.dropout)
@@ -374,6 +396,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
                         r_r_bias=None if self.untie_r else self.r_r_bias,
                         output_attentions=self.output_attentions,
                         layer_norm_epsilon=config.layer_norm_epsilon,
+                        init_std=config.init_std,
                         name='layers_._{}'.format(i))
                 )
         else:  # learnable embeddings and absolute embeddings

pytorch_transformers/modeling_tf_utils.py

@@ -277,20 +277,20 @@ class TFPreTrainedModel(tf.keras.Model):
         return model

 class TFConv1D(tf.keras.layers.Layer):
-    def __init__(self, nf, nx, *inputs, **kwargs):
+    def __init__(self, nf, nx, *inputs, initializer_range=0.02, **kwargs):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
             Basically works like a Linear layer but the weights are transposed
         """
         super(TFConv1D, self).__init__(*inputs, **kwargs)
         self.nf = nf
         self.nx = nx
+        self.initializer_range = initializer_range

     def build(self, input_shape):
         self.weight = self.add_weight(
             "weight",
             shape=[self.nx, self.nf],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=0.02))
+            initializer=get_initializer(self.initializer_range))
         self.bias = self.add_weight(
             "bias",
             shape=[1, self.nf],
@@ -314,19 +314,17 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         super(TFSharedEmbeddings, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range

     def build(self, input_shape):
         """Build shared word embedding layer
         Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
-        initializer_range = self.hidden_size**-0.5 if self.initializer_range is None else self.initializer_range
         self.weight = self.add_weight(
             "weight",
             shape=[self.vocab_size, self.hidden_size],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=initializer_range))
+            initializer=get_initializer(self.initializer_range))
         super(TFSharedEmbeddings, self).build(input_shape)

     def call(self, inputs, mode="embedding"):
@@ -385,7 +383,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
         summary_first_dropout: Add a dropout before the projection and activation
         summary_last_dropout: Add a dropout after the projection and activation
     """
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, initializer_range=0.02, **kwargs):
         super(TFSequenceSummary, self).__init__(**kwargs)
         self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
@@ -401,7 +399,9 @@ class TFSequenceSummary(tf.keras.layers.Layer):
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
-            self.summary = tf.keras.layers.Dense(num_classes, name='summary')
+            self.summary = tf.keras.layers.Dense(num_classes,
+                                                 kernel_initializer=get_initializer(initializer_range),
+                                                 name='summary')
         self.activation = None
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':

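A quick throwaway check of what the new initializer does in practice. This assumes the TruncatedNormal sketch of get_initializer given near the top of this page; it is not part of the commit:

import tensorflow as tf
from pytorch_transformers.modeling_tf_utils import get_initializer

init = get_initializer(0.02)
w = init(shape=(768, 768))
# A truncated normal resamples values beyond two stddevs, so the empirical
# stddev comes out slightly below the nominal 0.02.
print(float(tf.math.reduce_std(w)))
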
pytorch_transformers/modeling_tf_xlm.py

@@ -25,7 +25,7 @@ import numpy as np
 import tensorflow as tf
 from .configuration_xlm import XLMConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -119,10 +119,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         self.n_heads = n_heads
         assert self.dim % self.n_heads == 0
-        self.q_lin = tf.keras.layers.Dense(dim, name='q_lin')
-        self.k_lin = tf.keras.layers.Dense(dim, name='k_lin')
-        self.v_lin = tf.keras.layers.Dense(dim, name='v_lin')
-        self.out_lin = tf.keras.layers.Dense(dim, name='out_lin')
+        self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin')
+        self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin')
+        self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin')
+        self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin')
         self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
         self.pruned_heads = set()
@@ -199,8 +199,8 @@ class TFTransformerFFN(tf.keras.layers.Layer):
     def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
         super(TFTransformerFFN, self).__init__(**kwargs)
-        self.lin1 = tf.keras.layers.Dense(dim_hidden, name='lin1')
-        self.lin2 = tf.keras.layers.Dense(out_dim, name='lin2')
+        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1')
+        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2')
         self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
         self.dropout = tf.keras.layers.Dropout(config.dropout)
@@ -249,13 +249,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, self.dim, name='position_embeddings')
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             self.dim,
+                                                             embeddings_initializer=get_initializer(config.embed_init_std),
+                                                             name='position_embeddings')
         if config.sinusoidal_embeddings:
             raise NotImplementedError
             # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1 and config.use_lang_emb:
-            self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, self.dim, name='lang_embeddings')
-        self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, name='embeddings')  # padding_idx=self.pad_index)
+            self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs,
+                                                             self.dim,
+                                                             embeddings_initializer=get_initializer(config.embed_init_std),
+                                                             name='lang_embeddings')
+        self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings')  # padding_idx=self.pad_index)
         self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb')

         # transformer layers
@@ -676,7 +682,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
         self.num_labels = config.num_labels
         self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, name='sequence_summary')
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -721,7 +727,9 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.init_std),
+                                                name='qa_outputs')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)

pytorch_transformers/modeling_tf_xlnet.py

@@ -28,7 +28,7 @@ import numpy as np
 import tensorflow as tf
 from .configuration_xlnet import XLNetConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
@@ -87,7 +87,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)

     def build(self, input_shape):
-        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
+        initializer = get_initializer(self.initializer_range)
         self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
                                  initializer=initializer,
                                  trainable=True, name='q')
@@ -104,13 +104,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                                  initializer=initializer,
                                  trainable=True, name='r')
         self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_r_bias')
         self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_s_bias')
         self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_w_bias')
         self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head),
                                          initializer=initializer,
@@ -294,8 +294,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXLNetFeedForward, self).__init__(**kwargs)
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
-        self.layer_1 = tf.keras.layers.Dense(config.d_inner, name='layer_1')
-        self.layer_2 = tf.keras.layers.Dense(config.d_model, name='layer_2')
+        self.layer_1 = tf.keras.layers.Dense(config.d_inner,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name='layer_1')
+        self.layer_2 = tf.keras.layers.Dense(config.d_model,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name='layer_2')
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         if isinstance(config.ff_activation, str) or \
                 (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
@@ -375,7 +379,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)

     def build(self, input_shape):
-        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
+        initializer = get_initializer(self.initializer_range)
         self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
                                         initializer=initializer,
                                         trainable=True, name='mask_emb')
@@ -900,8 +904,10 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
         self.num_labels = config.num_labels
         self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, name='sequence_summary')
-        self.logits_proj = tf.keras.layers.Dense(config.num_labels, name='logits_proj')
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary')
+        self.logits_proj = tf.keras.layers.Dense(config.num_labels,
+                                                 kernel_initializer=get_initializer(config.initializer_range),
+                                                 name='logits_proj')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -949,7 +955,9 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)