Unverified commit d38bbb22, authored by Julien Plu, committed by GitHub

Update the NER TF script (#3511)



* Update the NER TF script to remove the softmax and set the pad token label id to -1

* Reformat for quality and style checks
Co-authored-by: Julien Plu <julien.plu@adevinta.com>
parent eff757f2
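For context on the first change set: the classification head no longer applies a softmax, so the loss is computed from raw logits with from_logits=True, and padded / ignored positions are marked with a pad token label id of -1 and filtered out before the loss. A minimal sketch of that pattern, assuming toy shapes and made-up label values (nothing below is copied from the script):

import tensorflow as tf

# Hypothetical shapes: batch of 2 sequences, 4 tokens each, 3 NER labels.
num_labels = 3
pad_token_label_id = -1  # padding / sub-word positions carry this label id

logits = tf.random.normal((2, 4, num_labels))          # raw scores, no softmax applied
labels = tf.constant([[0, 2, -1, -1], [1, -1, 0, 2]])  # -1 marks positions to ignore

# Flatten, then keep only positions whose label is not the pad label id.
active = tf.reshape(labels, (-1,)) != pad_token_label_id
active_logits = tf.boolean_mask(tf.reshape(logits, (-1, num_labels)), active)
active_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)

# from_logits=True: the loss applies the softmax internally.
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
per_token_loss = loss_fct(active_labels, active_logits)
loss = tf.reduce_sum(per_token_loss) / logits.shape[0]
print(loss.numpy())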
@@ -157,7 +157,9 @@ def train(
     writer = tf.summary.create_file_writer("/tmp/mylogs")
 
     with strategy.scope():
-        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
+        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
+            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
+        )
         optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
 
         if args["fp16"]:
@@ -205,11 +207,9 @@ def train(
 
         with tf.GradientTape() as tape:
             logits = model(train_features["input_ids"], **inputs)[0]
-            logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(train_features["input_mask"], (-1,))
-            active_logits = tf.boolean_mask(logits, active_loss)
-            train_labels = tf.reshape(train_labels, (-1,))
-            active_labels = tf.boolean_mask(train_labels, active_loss)
+            active_loss = tf.reshape(train_labels, (-1,)) != pad_token_label_id
+            active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss)
+            active_labels = tf.boolean_mask(tf.reshape(train_labels, (-1,)), active_loss)
             cross_entropy = loss_fct(active_labels, active_logits)
             loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
             grads = tape.gradient(loss, model.trainable_variables)
@@ -329,11 +329,9 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode)
 
         with strategy.scope():
             logits = model(eval_features["input_ids"], **inputs)[0]
-            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
-            active_logits = tf.boolean_mask(tmp_logits, active_loss)
-            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
-            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
+            active_loss = tf.reshape(eval_labels, (-1,)) != pad_token_label_id
+            active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss)
+            active_labels = tf.boolean_mask(tf.reshape(eval_labels, (-1,)), active_loss)
             cross_entropy = loss_fct(active_labels, active_logits)
             loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
 
@@ -497,8 +495,8 @@ def main(_):
     )
 
     labels = get_labels(args["labels"])
-    num_labels = len(labels) + 1
-    pad_token_label_id = 0
+    num_labels = len(labels)
+    pad_token_label_id = -1
     config = AutoConfig.from_pretrained(
         args["config_name"] if args["config_name"] else args["model_name_or_path"],
         num_labels=num_labels,
@@ -522,7 +520,6 @@ def main(_):
             config=config,
             cache_dir=args["cache_dir"] if args["cache_dir"] else None,
         )
-        model.layers[-1].activation = tf.keras.activations.softmax
 
     train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
     train_dataset, num_train_examples = load_and_cache_examples(
...
@@ -214,7 +214,7 @@ class GradientAccumulator(object):
             raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
 
         for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
-            if accum_gradient is not None:
+            if accum_gradient is not None and gradient is not None:
                 accum_gradient.assign_add(gradient)
 
         self._accum_steps.assign_add(1)
@@ -241,6 +241,7 @@ class GradientAccumulator(object):
             return (
                 gradient.device_map.select_for_current_replica(gradient.values, replica_context)
                 for gradient in self._gradients
+                if gradient is not None
             )
         else:
             return self._gradients
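The second file's change guards GradientAccumulator against None gradients: tf.GradientTape.gradient returns None for variables that did not contribute to the loss, and calling assign_add with None would fail. A standalone sketch of that failure mode and the guard, not the library class itself (variable names below are illustrative):

import tensorflow as tf

w_used = tf.Variable(1.0)
w_unused = tf.Variable(1.0)  # never touched by the loss below

with tf.GradientTape() as tape:
    loss = w_used * 2.0

grads = tape.gradient(loss, [w_used, w_unused])
print(grads)  # [<tf.Tensor 2.0>, None] -- the unused variable yields None

# Accumulation buffers initialised to zeros, one per variable.
accum = [tf.Variable(tf.zeros_like(v)) for v in [w_used, w_unused]]

for acc, grad in zip(accum, grads):
    # Mirrors the accumulator's check: skip None gradients instead of crashing.
    if acc is not None and grad is not None:
        acc.assign_add(grad)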