Unverified commit 202d6863, authored by Susnato Dhar, committed by GitHub
Browse files

deleted references of self.vocab_size and self.type_vocab_size for multiple models [TF implementation] (#21164)

deleted references of self.vocab_size and self.type_vocab_size for multiple models [TF implementation] (#21164)
parent af37d183
...@@ -202,7 +202,6 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -202,7 +202,6 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.n_positions = config.n_positions self.n_positions = config.n_positions
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -302,10 +301,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -302,10 +301,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = self.tokens_embed(input_ids, mode="embedding") inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
...@@ -316,10 +315,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -316,10 +315,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
token_type_ids, token_type_ids,
tf.cast(self.vocab_size, dtype=token_type_ids.dtype), tf.cast(self.config.vocab_size, dtype=token_type_ids.dtype),
message=( message=(
"token_type_ids must be smaller than the embedding layer's input dimension (got" "token_type_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(token_type_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(token_type_ids)} >= {self.config.vocab_size})"
), ),
) )
token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding") token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
......
...@@ -74,8 +74,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): ...@@ -74,8 +74,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs): def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.input_embedding_size = config.input_embedding_size self.input_embedding_size = config.input_embedding_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -86,14 +85,14 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): ...@@ -86,14 +85,14 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.input_embedding_size], shape=[self.config.vocab_size, self.input_embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.input_embedding_size], shape=[self.config.type_vocab_size, self.input_embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -128,10 +127,10 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): ...@@ -128,10 +127,10 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -561,7 +560,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -561,7 +560,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.output_embedding_size = config.output_embedding_size self.output_embedding_size = config.output_embedding_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
...@@ -576,11 +575,11 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -576,11 +575,11 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.decoder = self.add_weight( self.decoder = self.add_weight(
name="decoder/weight", name="decoder/weight",
shape=[self.vocab_size, self.output_embedding_size], shape=[self.config.vocab_size, self.output_embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
self.decoder_bias = self.add_weight( self.decoder_bias = self.add_weight(
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
) )
super().build(input_shape) super().build(input_shape)
...@@ -597,7 +596,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -597,7 +596,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.decoder_bias = value["decoder_bias"] self.decoder_bias = value["decoder_bias"]
self.vocab_size = shape_list(value["decoder_bias"])[0] self.config.vocab_size = shape_list(value["decoder_bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -606,7 +605,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -606,7 +605,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.output_embedding_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.output_embedding_size])
hidden_states = self.LayerNorm(hidden_states) hidden_states = self.LayerNorm(hidden_states)
hidden_states = tf.matmul(a=hidden_states, b=self.decoder, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
return hidden_states return hidden_states
......
...@@ -82,8 +82,7 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -82,8 +82,7 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -94,14 +93,14 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -94,14 +93,14 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -150,10 +149,10 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -150,10 +149,10 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1018,7 +1017,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -1018,7 +1017,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -1031,7 +1030,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -1031,7 +1030,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1047,7 +1046,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -1047,7 +1046,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -1058,7 +1057,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -1058,7 +1057,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -87,8 +87,7 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): ...@@ -87,8 +87,7 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -99,14 +98,14 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): ...@@ -99,14 +98,14 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -155,10 +154,10 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): ...@@ -155,10 +154,10 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1021,7 +1020,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): ...@@ -1021,7 +1020,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -1034,7 +1033,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): ...@@ -1034,7 +1033,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1050,7 +1049,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): ...@@ -1050,7 +1049,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -1061,7 +1060,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): ...@@ -1061,7 +1060,7 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -136,8 +136,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): ...@@ -136,8 +136,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs): def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
...@@ -147,14 +146,14 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): ...@@ -147,14 +146,14 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -181,10 +180,10 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): ...@@ -181,10 +180,10 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -531,7 +530,7 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): ...@@ -531,7 +530,7 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.transform = TFRoFormerPredictionHeadTransform(config, name="transform") self.transform = TFRoFormerPredictionHeadTransform(config, name="transform")
...@@ -541,7 +540,7 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): ...@@ -541,7 +540,7 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -557,14 +556,14 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): ...@@ -557,14 +556,14 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -149,8 +149,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): ...@@ -149,8 +149,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: TapasConfig, **kwargs): def __init__(self, config: TapasConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_sizes = config.type_vocab_sizes
self.number_of_token_type_embeddings = len(config.type_vocab_sizes) self.number_of_token_type_embeddings = len(config.type_vocab_sizes)
self.reset_position_index_per_cell = config.reset_position_index_per_cell self.reset_position_index_per_cell = config.reset_position_index_per_cell
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
...@@ -163,7 +162,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): ...@@ -163,7 +162,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -173,7 +172,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): ...@@ -173,7 +172,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
for i, type_vocab_size in enumerate(self.type_vocab_sizes): for i, type_vocab_size in enumerate(self.config.type_vocab_sizes):
with tf.name_scope(f"token_type_embeddings_{i}"): with tf.name_scope(f"token_type_embeddings_{i}"):
setattr( setattr(
self, self,
...@@ -220,9 +219,9 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): ...@@ -220,9 +219,9 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
if self.reset_position_index_per_cell: if self.reset_position_index_per_cell:
# shape (batch_size, seq_len) # shape (batch_size, seq_len)
col_index = IndexMap(token_type_ids[:, :, 1], self.type_vocab_sizes[1], batch_dims=1) col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1)
# shape (batch_size, seq_len) # shape (batch_size, seq_len)
row_index = IndexMap(token_type_ids[:, :, 2], self.type_vocab_sizes[2], batch_dims=1) row_index = IndexMap(token_type_ids[:, :, 2], self.config.type_vocab_sizes[2], batch_dims=1)
# shape (batch_size, seq_len) # shape (batch_size, seq_len)
full_index = ProductIndexMap(col_index, row_index) full_index = ProductIndexMap(col_index, row_index)
# shape (max_rows * max_columns,). First absolute position for every cell # shape (max_rows * max_columns,). First absolute position for every cell
...@@ -238,10 +237,10 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): ...@@ -238,10 +237,10 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -687,7 +686,7 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer): ...@@ -687,7 +686,7 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFTapasPredictionHeadTransform(config, name="transform") self.transform = TFTapasPredictionHeadTransform(config, name="transform")
...@@ -697,7 +696,7 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer): ...@@ -697,7 +696,7 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -713,14 +712,14 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer): ...@@ -713,14 +712,14 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -171,8 +171,7 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -171,8 +171,7 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -183,14 +182,14 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -183,14 +182,14 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -239,10 +238,10 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -239,10 +238,10 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1013,7 +1012,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer): ...@@ -1013,7 +1012,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -1026,7 +1025,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer): ...@@ -1026,7 +1025,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1042,7 +1041,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer): ...@@ -1042,7 +1041,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -1053,7 +1052,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer): ...@@ -1053,7 +1052,7 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -402,13 +402,13 @@ class TFXLNetLayer(tf.keras.layers.Layer): ...@@ -402,13 +402,13 @@ class TFXLNetLayer(tf.keras.layers.Layer):
class TFXLNetLMHead(tf.keras.layers.Layer): class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
# an output-only bias for each token. # an output-only bias for each token.
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self): def get_output_embeddings(self):
...@@ -423,7 +423,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): ...@@ -423,7 +423,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = self.input_embeddings(hidden_states, mode="linear")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment