Unverified commit 202d6863, authored by Susnato Dhar, committed by GitHub
Browse files

Delete references to self.vocab_size and self.type_vocab_size in multiple models [TF implementation]

Delete references to self.vocab_size and self.type_vocab_size in multiple models [TF implementation]; the values are now read from self.config instead (#21164)
parent af37d183
...@@ -138,8 +138,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): ...@@ -138,8 +138,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs): def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -150,14 +149,14 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): ...@@ -150,14 +149,14 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -194,10 +193,10 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): ...@@ -194,10 +193,10 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -481,7 +480,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): ...@@ -481,7 +480,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -498,9 +497,9 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): ...@@ -498,9 +497,9 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight( self.decoder_bias = self.add_weight(
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
) )
super().build(input_shape) super().build(input_shape)
...@@ -518,7 +517,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): ...@@ -518,7 +517,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.decoder_bias = value["decoder_bias"] self.decoder_bias = value["decoder_bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -527,7 +526,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): ...@@ -527,7 +526,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
return hidden_states return hidden_states
......
...@@ -149,8 +149,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ...@@ -149,8 +149,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -161,14 +160,14 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ...@@ -161,14 +160,14 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -204,10 +203,10 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ...@@ -204,10 +203,10 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -648,7 +647,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -648,7 +647,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFBertPredictionHeadTransform(config, name="transform") self.transform = TFBertPredictionHeadTransform(config, name="transform")
...@@ -658,7 +657,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -658,7 +657,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -674,14 +673,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -674,14 +673,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -177,8 +177,7 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer): ...@@ -177,8 +177,7 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -189,14 +188,14 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer): ...@@ -189,14 +188,14 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -245,10 +244,10 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer): ...@@ -245,10 +244,10 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1018,7 +1017,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer): ...@@ -1018,7 +1017,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -1031,7 +1030,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer): ...@@ -1031,7 +1030,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1047,7 +1046,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer): ...@@ -1047,7 +1046,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -1058,7 +1057,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer): ...@@ -1058,7 +1057,7 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -201,7 +201,6 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer): ...@@ -201,7 +201,6 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
self.vocab_size = config.vocab_size
self.config = config self.config = config
...@@ -209,7 +208,7 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer): ...@@ -209,7 +208,7 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("token_embedding"): with tf.name_scope("token_embedding"):
self.weight = self.add_weight( self.weight = self.add_weight(
shape=(self.vocab_size, self.embed_dim), shape=(self.config.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True, trainable=True,
name="weight", name="weight",
...@@ -245,10 +244,10 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer): ...@@ -245,10 +244,10 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
......
...@@ -74,8 +74,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): ...@@ -74,8 +74,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: ConvBertConfig, **kwargs): def __init__(self, config: ConvBertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -86,14 +85,14 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): ...@@ -86,14 +85,14 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -130,10 +129,10 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): ...@@ -130,10 +129,10 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -791,12 +790,12 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer): ...@@ -791,12 +790,12 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -812,13 +811,13 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer): ...@@ -812,13 +811,13 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -844,7 +843,7 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL ...@@ -844,7 +843,7 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, **kwargs) super().__init__(config, **kwargs)
self.vocab_size = config.vocab_size self.config = config
self.convbert = TFConvBertMainLayer(config, name="convbert") self.convbert = TFConvBertMainLayer(config, name="convbert")
self.generator_predictions = TFConvBertGeneratorPredictions(config, name="generator_predictions") self.generator_predictions = TFConvBertGeneratorPredictions(config, name="generator_predictions")
......
...@@ -586,7 +586,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): ...@@ -586,7 +586,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
class TFCTRLLMHead(tf.keras.layers.Layer): class TFCTRLLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
# CTRL has numerical issues in XLA generate # CTRL has numerical issues in XLA generate
self.supports_xla_generation = False self.supports_xla_generation = False
...@@ -595,7 +595,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer): ...@@ -595,7 +595,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self): def get_output_embeddings(self):
...@@ -610,7 +610,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer): ...@@ -610,7 +610,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = self.input_embeddings(hidden_states, mode="linear")
......
...@@ -722,8 +722,7 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -722,8 +722,7 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = getattr(config, "embedding_size", config.hidden_size) self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
...@@ -738,15 +737,15 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -738,15 +737,15 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
if self.type_vocab_size > 0: if self.config.type_vocab_size > 0:
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
else: else:
...@@ -787,10 +786,10 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -787,10 +786,10 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -807,7 +806,7 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -807,7 +806,7 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
if self.position_biased_input: if self.position_biased_input:
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings += position_embeds final_embeddings += position_embeds
if self.type_vocab_size > 0: if self.config.type_vocab_size > 0:
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings += token_type_embeds final_embeddings += token_type_embeds
...@@ -857,7 +856,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): ...@@ -857,7 +856,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFDebertaPredictionHeadTransform(config, name="transform") self.transform = TFDebertaPredictionHeadTransform(config, name="transform")
...@@ -867,7 +866,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): ...@@ -867,7 +866,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -883,14 +882,14 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): ...@@ -883,14 +882,14 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -811,8 +811,7 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -811,8 +811,7 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = getattr(config, "embedding_size", config.hidden_size) self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
...@@ -827,15 +826,15 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -827,15 +826,15 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
if self.type_vocab_size > 0: if self.config.type_vocab_size > 0:
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
else: else:
...@@ -876,10 +875,10 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -876,10 +875,10 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -896,7 +895,7 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -896,7 +895,7 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
if self.position_biased_input: if self.position_biased_input:
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings += position_embeds final_embeddings += position_embeds
if self.type_vocab_size > 0: if self.config.type_vocab_size > 0:
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings += token_type_embeds final_embeddings += token_type_embeds
...@@ -948,7 +947,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): ...@@ -948,7 +947,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFDebertaV2PredictionHeadTransform(config, name="transform") self.transform = TFDebertaV2PredictionHeadTransform(config, name="transform")
...@@ -958,7 +957,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): ...@@ -958,7 +957,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -974,14 +973,14 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): ...@@ -974,14 +973,14 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -76,7 +76,7 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -76,7 +76,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.dim = config.dim self.dim = config.dim
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
...@@ -87,7 +87,7 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -87,7 +87,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.dim], shape=[self.config.vocab_size, self.dim],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
...@@ -114,10 +114,10 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -114,10 +114,10 @@ class TFEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -581,7 +581,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -581,7 +581,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.dim = config.dim self.dim = config.dim
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
...@@ -589,7 +589,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -589,7 +589,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -605,13 +605,13 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -605,13 +605,13 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -624,7 +624,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -624,7 +624,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.vocab_size = config.vocab_size self.config = config
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.vocab_transform = tf.keras.layers.Dense( self.vocab_transform = tf.keras.layers.Dense(
......
...@@ -478,8 +478,7 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -478,8 +478,7 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: ElectraConfig, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -490,14 +489,14 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -490,14 +489,14 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.embedding_size], shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -534,10 +533,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -534,10 +533,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1107,12 +1106,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): ...@@ -1107,12 +1106,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1128,13 +1127,13 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): ...@@ -1128,13 +1127,13 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -1153,7 +1152,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -1153,7 +1152,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(config, **kwargs) super().__init__(config, **kwargs)
self.vocab_size = config.vocab_size self.config = config
self.electra = TFElectraMainLayer(config, name="electra") self.electra = TFElectraMainLayer(config, name="electra")
self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions")
......
...@@ -202,7 +202,7 @@ class TFEsmEmbeddings(Layer): ...@@ -202,7 +202,7 @@ class TFEsmEmbeddings(Layer):
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.token_dropout = config.token_dropout self.token_dropout = config.token_dropout
self.mask_token_id = config.mask_token_id self.mask_token_id = config.mask_token_id
self.vocab_size = config.vocab_size self.config = config
def call( def call(
self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
...@@ -219,10 +219,10 @@ class TFEsmEmbeddings(Layer): ...@@ -219,10 +219,10 @@ class TFEsmEmbeddings(Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = self.word_embeddings(input_ids) inputs_embeds = self.word_embeddings(input_ids)
...@@ -1222,13 +1222,13 @@ class TFEsmLMHead(Layer): ...@@ -1222,13 +1222,13 @@ class TFEsmLMHead(Layer):
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="decoder", name="decoder",
) )
self.vocab_size = config.vocab_size self.config = config
def build(self, input_shape): def build(self, input_shape):
super().build(input_shape) super().build(input_shape)
# Separate bias to match the PT model and allow weight cross-loading to work # Separate bias to match the PT model and allow weight cross-loading to work
# Put it in the build so it gets the right name when adding it as a weight # Put it in the build so it gets the right name when adding it as a weight
self.bias = self.add_weight("bias", shape=(self.vocab_size,), initializer="zeros", trainable=True) self.bias = self.add_weight("bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
def get_bias(self): def get_bias(self):
return {"bias": self.bias} return {"bias": self.bias}
......
...@@ -82,7 +82,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer): ...@@ -82,7 +82,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std
...@@ -93,7 +93,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer): ...@@ -93,7 +93,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_std), initializer=get_initializer(initializer_range=self.initializer_std),
) )
...@@ -114,10 +114,10 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer): ...@@ -114,10 +114,10 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(self.weight, input_ids) inputs_embeds = tf.gather(self.weight, input_ids)
...@@ -924,12 +924,12 @@ class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer): ...@@ -924,12 +924,12 @@ class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer):
class TFFunnelMaskedLMHead(tf.keras.layers.Layer): class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -945,13 +945,13 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): ...@@ -945,13 +945,13 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states, training=False): def call(self, hidden_states, training=False):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -314,7 +314,6 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -314,7 +314,6 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.return_dict = config.use_return_dict self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.n_positions = config.n_positions self.n_positions = config.n_positions
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -449,7 +448,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -449,7 +448,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
tf.cast(self.config.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = self.wte(input_ids, mode="embedding") inputs_embeds = self.wte(input_ids, mode="embedding")
......
...@@ -347,7 +347,6 @@ class TFGPTJMainLayer(tf.keras.layers.Layer): ...@@ -347,7 +347,6 @@ class TFGPTJMainLayer(tf.keras.layers.Layer):
self.return_dict = config.use_return_dict self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.n_positions = config.n_positions self.n_positions = config.n_positions
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
......
...@@ -536,7 +536,6 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer): ...@@ -536,7 +536,6 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
self.vocab_size = config.vocab_size
self.config = config self.config = config
...@@ -544,7 +543,7 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer): ...@@ -544,7 +543,7 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("token_embedding"): with tf.name_scope("token_embedding"):
self.weight = self.add_weight( self.weight = self.add_weight(
shape=(self.vocab_size, self.embed_dim), shape=(self.config.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True, trainable=True,
name="weight", name="weight",
...@@ -580,10 +579,10 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer): ...@@ -580,10 +579,10 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
......
...@@ -62,8 +62,7 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): ...@@ -62,8 +62,7 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: LayoutLMConfig, **kwargs): def __init__(self, config: LayoutLMConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.max_2d_position_embeddings = config.max_2d_position_embeddings self.max_2d_position_embeddings = config.max_2d_position_embeddings
...@@ -75,14 +74,14 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): ...@@ -75,14 +74,14 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -145,10 +144,10 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): ...@@ -145,10 +144,10 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -622,7 +621,7 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): ...@@ -622,7 +621,7 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform") self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform")
...@@ -632,7 +631,7 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): ...@@ -632,7 +631,7 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -648,14 +647,14 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): ...@@ -648,14 +647,14 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -421,7 +421,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -421,7 +421,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -434,7 +434,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -434,7 +434,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -450,7 +450,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -450,7 +450,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -461,7 +461,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -461,7 +461,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -476,8 +476,7 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -476,8 +476,7 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -488,14 +487,14 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -488,14 +487,14 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
...@@ -544,10 +543,10 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -544,10 +543,10 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
......
...@@ -193,8 +193,7 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): ...@@ -193,8 +193,7 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.type_vocab_size = config.type_vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -205,14 +204,14 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): ...@@ -205,14 +204,14 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
...@@ -239,10 +238,10 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): ...@@ -239,10 +238,10 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -1054,7 +1053,7 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): ...@@ -1054,7 +1053,7 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.transform = TFLxmertPredictionHeadTransform(config, name="transform") self.transform = TFLxmertPredictionHeadTransform(config, name="transform")
...@@ -1064,7 +1063,7 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): ...@@ -1064,7 +1063,7 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -1080,14 +1079,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): ...@@ -1080,14 +1079,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def set_bias(self, value: tf.Variable): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
...@@ -166,9 +166,8 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -166,9 +166,8 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
self.trigram_input = config.trigram_input self.trigram_input = config.trigram_input
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.type_vocab_size = config.type_vocab_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation")
...@@ -184,14 +183,14 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -184,14 +183,14 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.embedding_size], shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
with tf.name_scope("token_type_embeddings"): with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
...@@ -218,10 +217,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -218,10 +217,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -658,13 +657,12 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -658,13 +657,12 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.transform = TFMobileBertPredictionHeadTransform(config, name="transform") self.transform = TFMobileBertPredictionHeadTransform(config, name="transform")
self.vocab_size = config.vocab_size
self.config = config self.config = config
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.dense = self.add_weight( self.dense = self.add_weight(
shape=(self.config.hidden_size - self.config.embedding_size, self.vocab_size), shape=(self.config.hidden_size - self.config.embedding_size, self.config.vocab_size),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="dense/weight", name="dense/weight",
...@@ -682,14 +680,14 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -682,14 +680,14 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer):
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder = value self.decoder = value
self.vocab_size = shape_list(value)[0] self.config.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
return {"bias": self.bias} return {"bias": self.bias}
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.transform(hidden_states) hidden_states = self.transform(hidden_states)
......
...@@ -97,7 +97,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): ...@@ -97,7 +97,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
...@@ -108,7 +108,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): ...@@ -108,7 +108,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(initializer_range=self.initializer_range),
) )
...@@ -149,10 +149,10 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): ...@@ -149,10 +149,10 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
# indices on GPU, returning zeros instead. This is a dangerous silent behavior. # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
tf.debugging.assert_less( tf.debugging.assert_less(
input_ids, input_ids,
tf.cast(self.vocab_size, dtype=input_ids.dtype), tf.cast(self.config.vocab_size, dtype=input_ids.dtype),
message=( message=(
"input_ids must be smaller than the embedding layer's input dimension (got" "input_ids must be smaller than the embedding layer's input dimension (got"
f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})" f" {tf.math.reduce_max(input_ids)} >= {self.config.vocab_size})"
), ),
) )
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
...@@ -735,7 +735,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -735,7 +735,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.config = config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...@@ -748,7 +748,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -748,7 +748,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
self.decoder = input_embeddings self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
...@@ -764,7 +764,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -764,7 +764,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
def set_bias(self, value): def set_bias(self, value):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
...@@ -775,7 +775,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -775,7 +775,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment