"vscode:/vscode.git/clone" did not exist on "b56a27b568aa666de89bf1a5e79eabcc72e11be2"
Commit 5571e9b6 authored by Jialu Liu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 396831100
parent b3537541
task:
  hub_module_url: ''
  model:
    num_classes: 3
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 128
  validation_data:
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    seq_length: 128
trainer:
  checkpoint_interval: 1000
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 61359
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 6136
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 5 epochs.
  train_steps: 61359
  validation_interval: 2000
  validation_steps: 307
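The step counts in this config follow from the comment inside it. A minimal Python sketch of the arithmetic, assuming the 392,702 MNLI training examples, 5 epochs, and batch size 32 stated there, with warmup set to roughly 10% of the training steps (an assumption, not stated in the config):

examples, epochs, batch_size = 392_702, 5, 32
train_steps = examples * epochs // batch_size  # 61359
warmup_steps = round(0.1 * train_steps)        # 6136
print(train_steps, warmup_steps)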
task:
  hub_module_url: ''
  max_answer_length: 30
  n_best_size: 20
  null_score_diff_threshold: 0.0
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 384
  validation_data:
    do_lower_case: true
    doc_stride: 128
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    query_length: 64
    seq_length: 384
    tokenization: WordPiece
    version_2_with_negative: false
    vocab_file: ''
trainer:
  checkpoint_interval: 500
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 5549
        end_learning_rate: 0.0
        initial_learning_rate: 5.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 555
      type: polynomial
  steps_per_loop: 500
  summary_interval: 500
  train_steps: 5549
  validation_interval: 500
  validation_steps: 339
task:
  hub_module_url: ''
  max_answer_length: 30
  n_best_size: 20
  null_score_diff_threshold: 0.0
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 384
  validation_data:
    do_lower_case: true
    doc_stride: 128
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    query_length: 64
    seq_length: 384
    tokenization: WordPiece
    version_2_with_negative: true
    vocab_file: ''
trainer:
  checkpoint_interval: 500
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 8160
        end_learning_rate: 0.0
        initial_learning_rate: 5.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        name: polynomial
        power: 1
        warmup_steps: 816
      type: polynomial
  steps_per_loop: 500
  summary_interval: 500
  train_steps: 8160
  validation_interval: 500
  validation_steps: 383
task:
  model:
    cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768,
                 name: next_sentence, num_classes: 2}]
    generator_encoder:
      bert:
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 768
        hidden_activation: gelu
        hidden_size: 256
        initializer_range: 0.02
        intermediate_size: 1024
        max_position_embeddings: 512
        num_attention_heads: 4
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
    num_masked_tokens: 76
    sequence_length: 512
    num_classes: 2
    discriminator_encoder:
      bert:
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 768
        hidden_activation: gelu
        hidden_size: 768
        initializer_range: 0.02
        intermediate_size: 3072
        max_position_embeddings: 512
        num_attention_heads: 12
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
    discriminator_loss_weight: 50.0
    disallow_correct: false
    tie_embeddings: true
  train_data:
    drop_remainder: true
    global_batch_size: 256
    input_path: ''
    is_training: true
    max_predictions_per_seq: 76
    seq_length: 512
    use_next_sentence_label: false
    use_position_id: false
  validation_data:
    drop_remainder: true
    global_batch_size: 256
    input_path: ''
    is_training: false
    max_predictions_per_seq: 76
    seq_length: 512
    use_next_sentence_label: false
    use_position_id: false
trainer:
  checkpoint_interval: 6000
  max_to_keep: 50
  optimizer_config:
    learning_rate:
      polynomial:
        cycle: false
        decay_steps: 1000000
        end_learning_rate: 0.0
        initial_learning_rate: 0.0002
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 10000
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  train_steps: 1000000
  validation_interval: 100
  validation_steps: 64
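The trainer block above describes a linear (power 1) warmup over 10,000 steps up to the 0.0002 peak rate, followed by a linear (power 1.0) decay to 0 over 1,000,000 steps. A rough, hand-written Python sketch of that shape; this is not the Model Garden implementation, whose behavior right at the warmup boundary may differ slightly:

def approx_lr(step, peak=2.0e-4, warmup_steps=10_000, decay_steps=1_000_000, end=0.0):
  # Linear warmup from 0 to the peak rate, then linear decay down to `end`.
  if step < warmup_steps:
    return peak * step / warmup_steps
  frac = min(step, decay_steps) / decay_steps
  return peak + (end - peak) * frac

print(approx_lr(5_000))    # mid-warmup, ~1e-4
print(approx_lr(500_000))  # halfway through decay, ~1e-4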
task:
  hub_module_url: ''
  model:
    num_classes: 3
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 128
  validation_data:
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    seq_length: 128
trainer:
  checkpoint_interval: 1000
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 61359
        end_learning_rate: 0.0
        initial_learning_rate: 1.0e-04
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 6136
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 5 epochs.
  train_steps: 61359
  validation_interval: 2000
  # Eval data size = 9815 examples.
  validation_steps: 307
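The validation step count follows from the eval-size comment in this config. A one-line check in Python, assuming the 9,815 eval examples and batch size 32 from the config (drop_remainder is false, so the last partial batch is kept and the division is rounded up):

import math
print(math.ceil(9815 / 32))  # 307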
task:
  hub_module_url: ''
  max_answer_length: 30
  n_best_size: 20
  null_score_diff_threshold: 0.0
  train_data:
    drop_remainder: true
    global_batch_size: 48
    input_path: ''
    is_training: true
    seq_length: 384
  validation_data:
    do_lower_case: true
    doc_stride: 128
    drop_remainder: false
    global_batch_size: 48
    input_path: ''
    is_training: false
    query_length: 64
    seq_length: 384
    tokenization: WordPiece
    version_2_with_negative: false
    vocab_file: ''
trainer:
  checkpoint_interval: 500
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 9248
        end_learning_rate: 0.0
        initial_learning_rate: 8.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 925
      type: polynomial
  steps_per_loop: 500
  summary_interval: 500
  train_steps: 9248
  validation_interval: 500
  validation_steps: 226
task:
  hub_module_url: ''
  max_answer_length: 30
  n_best_size: 20
  null_score_diff_threshold: 0.0
  train_data:
    drop_remainder: true
    global_batch_size: 48
    input_path: ''
    is_training: true
    seq_length: 384
  validation_data:
    do_lower_case: true
    doc_stride: 128
    drop_remainder: false
    global_batch_size: 48
    input_path: ''
    is_training: false
    query_length: 64
    seq_length: 384
    tokenization: WordPiece
    version_2_with_negative: true
    vocab_file: ''
trainer:
  checkpoint_interval: 500
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 13601
        end_learning_rate: 0.0
        initial_learning_rate: 8.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        name: polynomial
        power: 1
        warmup_steps: 1360
      type: polynomial
  steps_per_loop: 500
  summary_interval: 500
  train_steps: 13601
  validation_interval: 500
  validation_steps: 255
task:
  model:
    candidate_size: 5
    num_shared_generator_hidden_layers: 3
    num_discriminator_task_agnostic_layers: 11
    tie_embeddings: true
    generator:
      attention_dropout_rate: 0.1
      dropout_rate: 0.1
      embedding_size: 128
      hidden_activation: gelu
      hidden_size: 256
      initializer_range: 0.02
      intermediate_size: 1024
      max_position_embeddings: 512
      num_attention_heads: 4
      num_layers: 6
      type_vocab_size: 2
      vocab_size: 30522
    discriminator:
      attention_dropout_rate: 0.1
      dropout_rate: 0.1
      embedding_size: 128
      hidden_activation: gelu
      hidden_size: 256
      initializer_range: 0.02
      intermediate_size: 1024
      max_position_embeddings: 512
      num_attention_heads: 4
      num_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
  train_data:
    drop_remainder: true
    global_batch_size: 256
    input_path: ''
    is_training: true
    max_predictions_per_seq: 76
    seq_length: 512
    use_next_sentence_label: false
    use_position_id: false
  validation_data:
    drop_remainder: true
    global_batch_size: 256
    input_path: ''
    is_training: false
    max_predictions_per_seq: 76
    seq_length: 512
    use_next_sentence_label: false
    use_position_id: false
trainer:
  checkpoint_interval: 4000
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        cycle: false
        decay_steps: 500000
        end_learning_rate: 0.0
        initial_learning_rate: 0.0005
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 10000
      type: polynomial
  steps_per_loop: 4000
  summary_interval: 4000
  train_steps: 500000
  validation_interval: 100
  validation_steps: 64
task:
  model:
    encoder:
      bert:
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 768
        hidden_activation: gelu
        hidden_size: 768
        initializer_range: 0.02
        intermediate_size: 3072
        max_position_embeddings: 512
        num_attention_heads: 12
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
task:
  model:
    encoder:
      bert:
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 128
        hidden_activation: gelu
        hidden_size: 256
        initializer_range: 0.02
        intermediate_size: 1024
        max_position_embeddings: 512
        num_attention_heads: 4
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
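Once re-indented as above, these snippets are valid YAML and the nesting can be checked directly. A minimal Python sketch using PyYAML; the file name is hypothetical and refers to a local copy of the encoder config just above, not to anything in the commit:

import yaml  # PyYAML

with open('encoder_config.yaml') as f:  # hypothetical local copy of the config above
  cfg = yaml.safe_load(f)

print(cfg['task']['model']['encoder']['bert']['hidden_size'])  # 256
print(cfg['task']['model']['encoder']['bert']['num_layers'])   # 12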
@@ -47,13 +47,6 @@ class ReplacedTokenDetectionHead(tf.keras.layers.Layer):
     self.activation = self.hidden_cfg['intermediate_activation']
     self.initializer = self.hidden_cfg['kernel_initializer']
 
-    if output not in ('predictions', 'logits'):
-      raise ValueError(
-          ('Unknown `output` value "%s". `output` can be either "logits" or '
-           '"predictions"') % output)
-    self._output_type = output
-
-  def build(self, input_shape):
     self.hidden_layers = []
     for i in range(self.num_task_agnostic_layers, self.num_hidden_instances):
       self.hidden_layers.append(
@@ -74,6 +67,12 @@ class ReplacedTokenDetectionHead(tf.keras.layers.Layer):
         units=1, kernel_initializer=self.initializer,
         name='transform/rtd_head')
 
+    if output not in ('predictions', 'logits'):
+      raise ValueError(
+          ('Unknown `output` value "%s". `output` can be either "logits" or '
+           '"predictions"') % output)
+    self._output_type = output
+
   def call(self, sequence_data, input_mask):
     """Compute inner-products of hidden vectors with sampled element embeddings.
@@ -117,13 +116,6 @@ class MultiWordSelectionHead(tf.keras.layers.Layer):
     self.activation = activation
     self.initializer = tf.keras.initializers.get(initializer)
 
-    if output not in ('predictions', 'logits'):
-      raise ValueError(
-          ('Unknown `output` value "%s". `output` can be either "logits" or '
-           '"predictions"') % output)
-    self._output_type = output
-
-  def build(self, input_shape):
     self._vocab_size, self.embed_size = self.embedding_table.shape
     self.dense = tf.keras.layers.Dense(
         self.embed_size,
@@ -133,7 +125,11 @@ class MultiWordSelectionHead(tf.keras.layers.Layer):
     self.layer_norm = tf.keras.layers.LayerNormalization(
         axis=-1, epsilon=1e-12, name='transform/mws_layernorm')
-    super(MultiWordSelectionHead, self).build(input_shape)
+    if output not in ('predictions', 'logits'):
+      raise ValueError(
+          ('Unknown `output` value "%s". `output` can be either "logits" or '
+           '"predictions"') % output)
+    self._output_type = output
 
   def call(self, sequence_data, masked_positions, candidate_sets):
     """Compute inner-products of hidden vectors with sampled element embeddings.
@@ -277,27 +273,28 @@ class TeamsPretrainer(tf.keras.Model):
     self.mlm_activation = mlm_activation
     self.mlm_initializer = mlm_initializer
     self.output_type = output_type
-    embedding_table = generator_network.embedding_network.get_embedding_table()
+    self.embedding_table = (
+        self.discriminator_mws_network.embedding_network.get_embedding_table())
     self.masked_lm = layers.MaskedLM(
-        embedding_table=embedding_table,
+        embedding_table=self.embedding_table,
         activation=mlm_activation,
         initializer=mlm_initializer,
         output=output_type,
         name='generator_masked_lm')
     discriminator_cfg = self.discriminator_mws_network.get_config()
+    self.num_task_agnostic_layers = num_discriminator_task_agnostic_layers
     self.discriminator_rtd_head = ReplacedTokenDetectionHead(
         encoder_cfg=discriminator_cfg,
-        num_task_agnostic_layers=num_discriminator_task_agnostic_layers,
+        num_task_agnostic_layers=self.num_task_agnostic_layers,
         output=output_type,
         name='discriminator_rtd')
     hidden_cfg = discriminator_cfg['hidden_cfg']
     self.discriminator_mws_head = MultiWordSelectionHead(
-        embedding_table=embedding_table,
+        embedding_table=self.embedding_table,
         activation=hidden_cfg['intermediate_activation'],
         initializer=hidden_cfg['kernel_initializer'],
         output=output_type,
         name='discriminator_mws')
-    self.num_task_agnostic_layers = num_discriminator_task_agnostic_layers
 
   def call(self, inputs):
     """TEAMS forward pass.
@@ -380,7 +377,7 @@ class TeamsPretrainer(tf.keras.Model):
     sampled_tokens = tf.stop_gradient(
         models.electra_pretrainer.sample_from_softmax(
             mlm_logits, disallow=None))
-    sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32)
+    sampled_tokids = tf.argmax(sampled_tokens, axis=-1, output_type=tf.int32)
     # Prepares input and label for replaced token detection task.
     updated_input_ids, masked = models.electra_pretrainer.scatter_update(
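The last hunk only makes the vocabulary axis explicit in the argmax call. A self-contained toy sketch of the surrounding step, turning generator MLM logits into sampled replacement token ids; this is not the repository's code, and the shapes and the categorical sampling are assumptions standing in for sample_from_softmax:

import tensorflow as tf

batch, masked, vocab = 2, 76, 30522
mlm_logits = tf.random.normal([batch, masked, vocab])
# Sample one token id per masked position, one-hot encode it, and block
# gradients, mirroring the sampled_tokens tensor in the hunk above.
sampled_ids = tf.random.categorical(
    tf.reshape(mlm_logits, [-1, vocab]), num_samples=1)[:, 0]
sampled_tokens = tf.stop_gradient(
    tf.reshape(tf.one_hot(sampled_ids, depth=vocab), [batch, masked, vocab]))
# The commit's change: pass the axis by keyword instead of positionally.
sampled_tokids = tf.argmax(sampled_tokens, axis=-1, output_type=tf.int32)
print(sampled_tokids.shape)  # (2, 76)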