Commit 83a41d39 authored by Julien Chaumond

💄 super

parent cd51893d
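The change is mechanical: every Python 2 compatible two-argument call of the form super(ClassName, self).__init__(...) (and the equivalent .forward(...)/.build(...) calls) is replaced by the zero-argument super().__init__(...), which Python 3 resolves against the enclosing class automatically. A minimal before/after sketch (the class name here is illustrative, not taken from the diff):

import torch.nn as nn

class MyBlock(nn.Module):
    def __init__(self, hidden_size):
        # Before this commit (Python 2 compatible form):
        #   super(MyBlock, self).__init__()
        # After this commit (Python 3 zero-argument form):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)

Both forms are equivalent at runtime on Python 3; the zero-argument form simply avoids repeating the class name.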
@@ -101,7 +101,7 @@ def gelu(x):
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
@@ -202,7 +202,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
@@ -217,7 +217,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.attn = Attention(nx, n_ctx, config, scale)
@@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "transformer"
     def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)
     def _init_weights(self, module):
         """ Initialize the weights.
@@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2Model, self).__init__(config)
+        super().__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past
@@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2LMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
@@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
...
@@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module):
     """
     def __init__(self, config, encoder, embeddings):
-        super(ModalEmbeddings, self).__init__()
+        super().__init__()
         self.config = config
         self.encoder = encoder
         self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
@@ -175,7 +175,7 @@ class MMBTModel(nn.Module):
     """
     def __init__(self, config, transformer, encoder):
-        super(MMBTModel, self).__init__()
+        super().__init__()
         self.config = config
         self.transformer = transformer
         self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
@@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module):
     """
     def __init__(self, config, transformer, encoder):
-        super(MMBTForClassification, self).__init__()
+        super().__init__()
         self.num_labels = config.num_labels
         self.mmbt = MMBTModel(config, transformer, encoder)
...
@@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0
@@ -221,7 +221,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
@@ -236,7 +236,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
@@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTModel, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
@@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)
...
@@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings):
     """
     def __init__(self, config):
-        super(RobertaEmbeddings, self).__init__(config)
+        super().__init__(config)
         self.padding_idx = 1
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
         self.position_embeddings = nn.Embedding(
@@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-        return super(RobertaEmbeddings, self).forward(
+        return super().forward(
             input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
         )
@@ -204,7 +204,7 @@ class RobertaModel(BertModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaModel, self).__init__(config)
+        super().__init__(config)
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()
@@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)
@@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module):
     """Roberta Head for masked language modeling."""
     def __init__(self, config):
-        super(RobertaLMHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
@@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForMultipleChoice, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
@@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
     def __init__(self, config):
-        super(RobertaClassificationHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
@@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
...
@@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module):
         """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
        """
-        super(T5LayerNorm, self).__init__()
+        super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
@@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module):
 class T5DenseReluDense(nn.Module):
     def __init__(self, config):
-        super(T5DenseReluDense, self).__init__()
+        super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module):
 class T5LayerFF(nn.Module):
     def __init__(self, config):
-        super(T5LayerFF, self).__init__()
+        super().__init__()
         self.DenseReluDense = T5DenseReluDense(config)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -185,7 +185,7 @@ class T5Attention(nn.Module):
     NEW_ID = itertools.count()
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Attention, self).__init__()
+        super().__init__()
         self.layer_id = next(T5Attention.NEW_ID)
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias
@@ -363,7 +363,7 @@ class T5Attention(nn.Module):
 class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerSelfAttention, self).__init__()
+        super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module):
 class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerCrossAttention, self).__init__()
+        super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module):
 class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Block, self).__init__()
+        super().__init__()
         self.is_decoder = config.is_decoder
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
@@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel):
 class T5Stack(T5PreTrainedModel):
     def __init__(self, config):
-        super(T5Stack, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder
@@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel):
     """
     def __init__(self, config):
-        super(T5Model, self).__init__(config)
+        super().__init__(config)
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
         encoder_config = copy.deepcopy(config)
@@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     """
     def __init__(self, config):
-        super(T5WithLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.model_dim = config.d_model
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
...
@@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
     """
     def __init__(self, config, **kwargs):
-        super(TFAlbertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.position_embeddings = tf.keras.layers.Embedding(
@@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
                 shape=[self.config.vocab_size, self.config.embedding_size],
                 initializer=get_initializer(self.config.initializer_range),
             )
-        super(TFAlbertEmbeddings, self).build(input_shape)
+        super().build(input_shape)
     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.
@@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 class TFAlbertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer):
 class TFAlbertAttention(TFBertSelfAttention):
     def __init__(self, config, **kwargs):
-        super(TFAlbertAttention, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.hidden_size = config.hidden_size
         self.dense = tf.keras.layers.Dense(
@@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention):
 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFAlbertAttention(config, name="attention")
         self.ffn = tf.keras.layers.Dense(
@@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayerGroup, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.output_attentions = config.output_attentions
@@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
 class TFAlbertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFAlbertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dense = tf.keras.layers.Dense(
@@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.decoder_bias = self.add_weight(
             shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )
-        super(TFAlbertMLMHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)
@@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
     """
     def __init__(self, config, **kwargs):
-        super(TFAlbertModel, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
@@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.albert = TFAlbertModel(config, name="albert")
         self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
@@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.albert = TFAlbertModel(config, name="albert")
...
@@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
     """
     def __init__(self, config, **kwargs):
-        super(TFBertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.initializer_range = config.initializer_range
@@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                 shape=[self.vocab_size, self.hidden_size],
                 initializer=get_initializer(self.initializer_range),
             )
-        super(TFBertEmbeddings, self).build(input_shape)
+        super().build(input_shape)
     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.
@@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
 class TFBertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
 class TFBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.self_attention = TFBertSelfAttention(config, name="self")
         self.dense_output = TFBertSelfOutput(config, name="output")
@@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer):
 class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertIntermediate, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
 class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer):
 class TFBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFBertAttention(config, name="attention")
         self.intermediate = TFBertIntermediate(config, name="intermediate")
         self.bert_output = TFBertOutput(config, name="output")
@@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer):
 class TFBertEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertEncoder, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
@@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
 class TFBertPooler(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPooler, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
@@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer):
 class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
 class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertLMPredictionHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.transform = TFBertPredictionHeadTransform(config, name="transform")
@@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFBertLMPredictionHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)
@@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
 class TFBertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
     def call(self, sequence_output):
@@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer):
 class TFBertNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertNSPHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.seq_relationship = tf.keras.layers.Dense(
             2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
         )
@@ -472,7 +472,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
 class TFBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFBertEmbeddings(config, name="embeddings")
@@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
     def call(self, inputs, **kwargs):
@@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")
@@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
@@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")
@@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
@@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
@@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
...
@@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 class TFMultiHeadAttention(tf.keras.layers.Layer):
     def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = output_attentions
         self.num_heads = num_heads
         self.d_model_size = d_model_size
@@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
     ):
-        super(TFEncoderLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.multi_head_attention = TFMultiHeadAttention(
             d_model_size, num_heads, output_attentions, name="multi_head_attention"
@@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
 class TFCTRLMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFCTRLMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past
@@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
     def call(self, inputs, **kwargs):
@@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 class TFCTRLLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFCTRLLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is
@@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFCTRLLMHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
         self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
...
@@ -65,7 +65,7 @@ def gelu_new(x):
 class TFEmbeddings(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dim = config.dim
         self.initializer_range = config.initializer_range
@@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
             self.word_embeddings = self.add_weight(
                 "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
             )
-        super(TFEmbeddings, self).build(input_shape)
+        super().build(input_shape)
     def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
         """Get token embeddings of inputs.
@@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
 class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_heads = config.n_heads
         self.dim = config.dim
@@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 class TFFFN(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFFFN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.lin1 = tf.keras.layers.Dense(
             config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
@@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer):
 class TFTransformerBlock(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFTransformerBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_heads = config.n_heads
         self.dim = config.dim
@@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
 class TFTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_layers = config.n_layers
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer):
 class TFDistilBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFDistilBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
@@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings
     def call(self, inputs, **kwargs):
@@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
 class TFDistilBertLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFDistilBertLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is
@@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFDistilBertLMHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.vocab_size = config.vocab_size
@@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
@@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
@@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
         self.qa_outputs = tf.keras.layers.Dense(
...
@@ -58,7 +58,7 @@ def gelu(x):
class TFAttention(tf.keras.layers.Layer):
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-super(TFAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
n_state = nx # in Attention: n_state=768 (nx=n_embd)
@@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
-super(TFMLP, self).__init__(**kwargs)
+super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
@@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
-super(TFBlock, self).__init__(**kwargs)
+super().__init__(**kwargs)
nx = config.n_embd
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
@@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFGPT2MainLayer(tf.keras.layers.Layer):
def __init__(self, config, *inputs, **kwargs):
-super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
@@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
@@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
def get_output_embeddings(self):
@@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFGPT2MainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
...
@@ -66,7 +66,7 @@ ACT_FNS = {
class TFAttention(tf.keras.layers.Layer):
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-super(TFAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
n_state = nx # in Attention: n_state=768 (nx=n_embd)
@@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
-super(TFMLP, self).__init__(**kwargs)
+super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
@@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
-super(TFBlock, self).__init__(**kwargs)
+super().__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
def __init__(self, config, *inputs, **kwargs):
-super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
@@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
@@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def get_output_embeddings(self):
@@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
...
@@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
"""
def __init__(self, config, **kwargs):
-super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
+super().__init__(config, **kwargs)
self.padding_idx = 1
def create_position_ids_from_input_ids(self, x):
@@ -78,7 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-return super(TFRobertaEmbeddings, self)._embedding(
+return super()._embedding(
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training
)
@@ -89,7 +89,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
"""
def __init__(self, config, **kwargs):
-super(TFRobertaMainLayer, self).__init__(config, **kwargs)
+super().__init__(config, **kwargs)
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
def get_input_embeddings(self):
@@ -234,7 +234,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
def call(self, inputs, **kwargs):
@@ -246,7 +246,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
"""Roberta Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
-super(TFRobertaLMHead, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
@@ -260,7 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-super(TFRobertaLMHead, self).build(input_shape)
+super().build(input_shape)
def call(self, features):
x = self.dense(features)
@@ -305,7 +305,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
@@ -328,7 +328,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
-super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
+super().__init__(config, **kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
@@ -383,7 +383,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, name="roberta")
@@ -433,7 +433,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, name="roberta")
...
@@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
""" Construct a layernorm module in the T5 style
No bias and no substraction of mean.
"""
-super(TFT5LayerNorm, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.variance_epsilon = epsilon
def build(self, input_shape):
"""Build shared word embedding layer """
self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
-super(TFT5LayerNorm, self).build(input_shape)
+super().build(input_shape)
def call(self, x):
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
@@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
class TFT5DenseReluDense(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFT5DenseReluDense, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
@@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
class TFT5LayerFF(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFT5LayerFF, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
@@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-super(TFT5Attention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.layer_id = next(TFT5Attention.NEW_ID)
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
@@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer):
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.SelfAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
)
@@ -279,7 +279,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.EncDecAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
)
@@ -299,7 +299,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
class TFT5Block(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-super(TFT5Block, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.is_decoder = config.is_decoder
self.layer = []
self.layer.append(
@@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer):
####################################################
class TFT5MainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFT5MainLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.is_decoder = config.is_decoder
@@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
encoder_config = copy.deepcopy(config)
@@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.model_dim = config.d_model
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
...
@@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
class TFPositionalEmbedding(tf.keras.layers.Layer):
def __init__(self, demb, **kwargs):
-super(TFPositionalEmbedding, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
@@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
class TFPositionwiseFF(tf.keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
-super(TFPositionwiseFF, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.d_model = d_model
self.d_inner = d_inner
@@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
init_std=0.02,
**kwargs
):
-super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = output_attentions
self.n_head = n_head
@@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
)
-super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
+super().build(input_shape)
def _rel_shift(self, x):
x_size = shape_list(x)
@@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
init_std=0.02,
**kwargs
):
-super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
n_head,
@@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
-super(TFAdaptiveEmbedding, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
@@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
name="emb_projs_._{}".format(i),
)
)
-super(TFAdaptiveEmbedding, self).build(input_shape)
+super().build(input_shape)
def call(self, inp):
if self.div_val == 1:
@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFTransfoXLMainLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
@@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
)
-super(TFTransfoXLMainLayer, self).build(input_shape)
+super().build(input_shape)
def get_input_embeddings(self):
return self.word_emb
@@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
@@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
"""
def __init__(self, config):
-super(TFTransfoXLLMHeadModel, self).__init__(config)
+super().__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
self.sample_softmax = config.sample_softmax
# use sampled softmax
...
@@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
-super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.vocab_size = vocab_size
self.d_embed = d_embed
@@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
name="out_layers_._{}_._bias".format(i),
)
self.out_layers.append((weight, bias))
-super(TFAdaptiveSoftmaxMask, self).build(input_shape)
+super().build(input_shape)
@staticmethod
def _logit(x, W, b, proj=None):
...
@@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
return {"input_ids": tf.constant(DUMMY_INPUTS)}
def __init__(self, config, *inputs, **kwargs):
-super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
+super().__init__(*inputs, **kwargs)
if not isinstance(config, PretrainedConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
@@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer):
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
"""
-super(TFConv1D, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.nf = nf
self.nx = nx
self.initializer_range = initializer_range
@@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
"""
def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
-super(TFSharedEmbeddings, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
@@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
self.weight = self.add_weight(
"weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
)
-super(TFSharedEmbeddings, self).build(input_shape)
+super().build(input_shape)
def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs.
@@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
"""
def __init__(self, config, initializer_range=0.02, **kwargs):
-super(TFSequenceSummary, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
if self.summary_type == "attn":
...
@@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
-super(TFMultiHeadAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
self.output_attentions = config.output_attentions
self.dim = dim
@@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
class TFTransformerFFN(tf.keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
-super(TFTransformerFFN, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
@@ -198,7 +198,7 @@ class TFTransformerFFN(tf.keras.layers.Layer):
class TFXLMMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFXLMMainLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
@@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
@@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
"""
def __init__(self, config, input_embeddings, **kwargs):
-super(TFXLMPredLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
@@ -641,7 +641,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
def build(self, input_shape):
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
-super(TFXLMPredLayer, self).build(input_shape)
+super().build(input_shape)
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
@@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
...
@@ -57,7 +57,7 @@ ACT2FN = {
class TFXLNetRelativeAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFXLNetRelativeAttention, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
if config.d_model % config.n_head != 0:
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self.seg_embed = self.add_weight(
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
)
-super(TFXLNetRelativeAttention, self).build(input_shape)
+super().build(input_shape)
def prune_heads(self, heads):
raise NotImplementedError
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
class TFXLNetFeedForward(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFXLNetFeedForward, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.layer_1 = tf.keras.layers.Dense(
config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
class TFXLNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFXLNetLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
self.ff = TFXLNetFeedForward(config, name="ff")
self.dropout = tf.keras.layers.Dropout(config.dropout)
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
-super(TFXLNetLMHead, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.vocab_size = config.vocab_size
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-super(TFXLNetLMHead, self).build(input_shape)
+super().build(input_shape)
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
class TFXLNetMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
-super(TFXLNetMainLayer, self).__init__(**kwargs)
+super().__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.output_past = config.output_past
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
"""
def __init__(self, config, *inputs, **kwargs):
-super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
+super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
# """
# def __init__(self, config, *inputs, **kwargs):
-#     super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+#     super().__init__(config, *inputs, **kwargs)
# self.start_n_top = config.start_n_top
# self.end_n_top = config.end_n_top
...
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
class PositionalEmbedding(nn.Module):
def __init__(self, demb):
-super(PositionalEmbedding, self).__init__()
+super().__init__()
self.demb = demb
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
class PositionwiseFF(nn.Module):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
-super(PositionwiseFF, self).__init__()
+super().__init__()
self.d_model = d_model
self.d_inner = d_inner
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
output_attentions=False,
layer_norm_epsilon=1e-5,
):
-super(RelPartialLearnableMultiHeadAttn, self).__init__()
+super().__init__()
self.output_attentions = output_attentions
self.n_head = n_head
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
class RelPartialLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
-super(RelPartialLearnableDecoderLayer, self).__init__()
+super().__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
@@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
-super(AdaptiveEmbedding, self).__init__()
+super().__init__()
self.n_token = n_token
self.d_embed = d_embed
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
"""
def __init__(self, config):
-super(TransfoXLModel, self).__init__(config)
+super().__init__(config)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""
def __init__(self, config):
-super(TransfoXLLMHeadModel, self).__init__(config)
+super().__init__(config)
self.transformer = TransfoXLModel(config)
self.sample_softmax = config.sample_softmax
# use sampled softmax
...
@@ -29,7 +29,7 @@ import torch.nn.functional as F
class ProjectedAdaptiveLogSoftmax(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
-super(ProjectedAdaptiveLogSoftmax, self).__init__()
+super().__init__()
self.n_token = n_token
self.d_embed = d_embed
...
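The hunks above all apply the same mechanical change: the Python 2-compatible two-argument call super(ClassName, self) is replaced by the Python 3 zero-argument super(), which lets the interpreter supply the enclosing class and the current instance. A minimal, self-contained sketch of the pattern; Base and the Widget classes are illustrative names, not classes from this repository:

# Before/after sketch of the refactor; names are hypothetical.
class Base:
    def __init__(self, name):
        self.name = name

class WidgetOld(Base):
    def __init__(self, name):
        # Python 2-compatible form: repeats the class name.
        super(WidgetOld, self).__init__(name)

class WidgetNew(Base):
    def __init__(self, name):
        # Python 3 zero-argument form: nothing to keep in sync.
        super().__init__(name)

assert WidgetOld("a").name == "a" and WidgetNew("b").name == "b"

Both forms behave identically under Python 3; the shorter form avoids restating the class name, so a later rename or a copy-pasted __init__ body cannot end up calling the wrong superclass.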