Unverified Commit 96d833b2 authored by Matt, committed by GitHub

Return scalar losses instead of per-sample means (#18013)

* Return scalar losses instead of per-sample means

* Make loss shape (1,) instead of scalar

* Allow scalar losses in test_loss_computation

* Allow scalar losses in test_loss_computation

* Allow scalar losses in test_loss_computation

* Remove XLA loss function for RAG
parent 6cb19540
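
For context, the change can be summarized with a minimal standalone sketch (illustrative tensors, not the library code itself): the old losses averaged over the sequence axis only, yielding one value per sample, while the new losses average over every unmasked token in the whole batch and reshape the result to (1,).

import tensorflow as tf

# Stand-in per-token losses and -100-masked labels, purely for illustration
labels = tf.constant([[2, 7, -100, -100], [4, -100, -100, -100]])
per_token_loss = tf.random.uniform(tf.shape(labels), dtype=tf.float32)
mask = tf.cast(labels != -100, per_token_loss.dtype)

# Old behaviour: one mean per sample, shape (batch_size,)
old_loss = tf.reduce_sum(per_token_loss * mask, axis=1) / tf.maximum(1.0, tf.reduce_sum(mask, axis=1))

# New behaviour: one global mean over all unmasked tokens, shape (1,)
new_loss = tf.reshape(tf.reduce_sum(per_token_loss * mask) / tf.reduce_sum(mask), (1,))

print(old_loss.shape, new_loss.shape)  # (2,) (1,)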
@@ -206,11 +206,9 @@ class TFCausalLanguageModelingLoss:
         unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
         # make sure only labels that are not equal to -100 affect the loss
         loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype)
-        # Avoid division by zero later
-        loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
         masked_loss = unmasked_loss * loss_mask
-        reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
-        return reduced_masked_loss
+        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
+        return tf.reshape(reduced_masked_loss, (1,))
 
 
 class TFQuestionAnsweringLoss:
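
The two reductions are not numerically equivalent when samples contain different numbers of unmasked tokens: the new global mean weights every token equally, which is also the convention PyTorch's CrossEntropyLoss follows with ignore_index. A small worked example (illustrative numbers only):

import tensorflow as tf

# Sample A has 3 valid tokens (losses 1, 1, 1); sample B has 1 valid token (loss 5)
losses = tf.constant([[1.0, 1.0, 1.0], [5.0, 0.0, 0.0]])
mask = tf.constant([[1.0, 1.0, 1.0], [1.0, 0.0, 0.0]])

per_sample = tf.reduce_sum(losses * mask, axis=1) / tf.reduce_sum(mask, axis=1)
mean_of_means = tf.reduce_mean(per_sample)                        # (1 + 5) / 2 = 3.0
global_mean = tf.reduce_sum(losses * mask) / tf.reduce_sum(mask)  # 8 / 4 = 2.0

print(float(mean_of_means), float(global_mean))  # 3.0 2.0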
@@ -266,11 +264,10 @@ class TFTokenClassificationLoss:
         # are taken into account as loss
         loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype)
-        # Avoid possible division by zero later
-        loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
         # Masked positions will have a loss of NaN because -100 and -1 are not valid labels
         masked_loss = unmasked_loss * loss_mask
-        reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
-        return reduced_masked_loss
+        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
+        return tf.reshape(reduced_masked_loss, (1,))
 
 
 class TFSequenceClassificationLoss:
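
The token-classification hunk uses the same clip-then-mask pattern as the causal-LM one. A small self-contained version of that pattern (stand-in shapes and labels, assumed here purely for illustration):

import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
labels = tf.constant([[1, -100, 0], [2, 1, -100]])  # -100 marks ignored positions
logits = tf.random.normal((2, 3, 4))                # (batch, seq_len, num_labels)

# Clip invalid labels to 0 so the CE call stays finite, then zero them via the mask
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)  # shape (2, 3)
loss_mask = tf.cast(labels >= 0, unmasked_loss.dtype)
masked_loss = unmasked_loss * loss_mask
loss = tf.reshape(tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask), (1,))
print(loss.shape)  # (1,)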
@@ -118,20 +118,18 @@ class TFAlbertPreTrainingLoss:
         # make sure only labels that are not equal to -100
         # are taken into account for the loss computation
         lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
-        # Avoid division by zero later
-        lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1))
         masked_lm_losses = unmasked_lm_losses * lm_loss_mask
-        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator
+        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
 
         sop_logits = tf.reshape(logits[1], (-1, 2))
         # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
         unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
         sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)
-        # No reduction because this already has shape (num_samples,)
         masked_sop_loss = unmasked_sop_loss * sop_loss_mask
+        reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)
 
-        return reduced_masked_lm_loss + masked_sop_loss
+        return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
 
 
 class TFAlbertEmbeddings(tf.keras.layers.Layer):
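
For the SOP head in isolation, the masking works the same way, just over a (batch_size,) loss rather than a (batch_size, seq_len) one. A sketch with stand-in logits and labels:

import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
sentence_order_label = tf.constant([0, 1, -100])  # one unlabelled sample
sop_logits = tf.random.normal((3, 2))             # (batch, 2), as in the hunk above

unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(sentence_order_label), y_pred=sop_logits)
sop_loss_mask = tf.cast(sentence_order_label != -100, unmasked_sop_loss.dtype)
masked_sop_loss = unmasked_sop_loss * sop_loss_mask
reduced = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)
print(reduced.shape)  # () — rank-0, reshaped to (1,) after adding the LM term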
@@ -130,18 +130,17 @@ class TFBertPreTrainingLoss:
         # make sure only labels that are not equal to -100
         # are taken into account for the loss computation
         lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
-        # Avoid potential division by zero later
-        lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1))
         masked_lm_losses = unmasked_lm_losses * lm_loss_mask
-        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator
+        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
 
         # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
         unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1])
         ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype)
-        # Just zero out samples where label is -100, no reduction
         masked_ns_loss = unmasked_ns_loss * ns_loss_mask
+        reduced_masked_ns_loss = tf.reduce_sum(masked_ns_loss) / tf.reduce_sum(ns_loss_mask)
 
-        return reduced_masked_lm_loss + masked_ns_loss
+        return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
 
 
 class TFBertEmbeddings(tf.keras.layers.Layer):
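
One hedged observation about these hunks: the removed tf.math.maximum(1, ...) guard was what kept the denominator nonzero, and the new global mask sum has no such floor, so a batch in which every label is -100 would now yield NaN from a 0/0 division. A two-line demonstration of the edge case:

import tensorflow as tf

mask = tf.zeros((2, 4))  # every position masked out
print(float(tf.reduce_sum(mask * 3.0) / tf.reduce_sum(mask)))  # nan, from 0/0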
@@ -2518,7 +2518,6 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
         unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
         # make sure only non-padding labels affect the loss
         loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
-        loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
         masked_loss = unmasked_loss * loss_mask
-        reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
-        return reduced_masked_loss
+        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
+        return tf.reshape(reduced_masked_loss, (1,))
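
The LED version masks on the padding token instead of -100; the pattern is otherwise identical. A standalone sketch, assuming a pad_token_id of 1 purely for illustration:

import tensorflow as tf

pad_token_id = 1
labels = tf.constant([[5, 8, 1, 1], [9, 1, 1, 1]])
unmasked_loss = tf.random.uniform(tf.shape(labels), dtype=tf.float32)  # stand-in per-token CE losses
loss_mask = tf.cast(labels != pad_token_id, unmasked_loss.dtype)
masked_loss = unmasked_loss * loss_mask
loss = tf.reshape(tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask), (1,))
print(loss.shape)  # (1,)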
@@ -1333,46 +1333,29 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
     # Adopted modeling_tf_bart + add smooth_loss to match with pytorch version
     def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, reduce_loss=False):
         """CrossEntropyLoss that ignores pad tokens"""
-        if self.config.tf_legacy_loss:
-            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-                from_logits=True,
-                reduction=tf.keras.losses.Reduction.SUM,
-            )
-
-            if from_logits is False:  # convert to logits
-                eps = 1e-9
-                y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps)
-                y_pred = tf.math.log(y_pred)
-
-            logits = y_pred
-            melted_labels = tf.reshape(labels, (-1,))
-            active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id)
-
-            reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss)
-            labels = tf.boolean_mask(melted_labels, active_loss)
-            nll_loss = loss_fn(labels, reduced_logits)
-
-            smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1)
-            smooth_loss = tf.reduce_sum(smooth_loss)  # sum and squeeze like torch
-            eps_i = smooth_epsilon / reduced_logits.shape[-1]
-
-            loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
-            return loss
-
+        # Matt: As written, this loss is not XLA-compatible, but it's doing some very weird things
+        #       and I don't feel comfortable converting it.
         loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=from_logits,
-            reduction=tf.keras.losses.Reduction.NONE,
+            from_logits=True,
+            reduction=tf.keras.losses.Reduction.SUM,
         )
 
-        unmasked_loss = loss_fn(labels, y_pred)
-        loss_mask = labels != self.config.generator.pad_token_id
-        nll_loss = tf.reduce_sum(unmasked_loss * loss_mask)
+        if from_logits is False:  # convert to logits
+            eps = 1e-9
+            y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps)
+            y_pred = tf.math.log(y_pred)
 
-        # Matt: This makes no sense to me, but I'm just copying the old loss in XLA-compatible form
-        smooth_loss = -tf.reduce_sum(y_pred * tf.expand_dims(labels, -1), axis=-1)
-        smooth_loss = tf.reduce_sum(smooth_loss)
-        eps_i = smooth_epsilon / y_pred.shape[-1]
+        logits = y_pred
+        melted_labels = tf.reshape(labels, (-1,))
+        active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id)
+
+        reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss)
+        labels = tf.boolean_mask(melted_labels, active_loss)
+        nll_loss = loss_fn(labels, reduced_logits)
+
+        smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1)
+        smooth_loss = tf.reduce_sum(smooth_loss)  # sum and squeeze like torch
+        eps_i = smooth_epsilon / reduced_logits.shape[-1]
 
         loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
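
The retained branch is standard uniform label smoothing: with smoothing mass ε spread over a vocabulary of size V, the loss is (1 − ε) · NLL + (ε / V) · Σ(−log p). A self-contained sketch of that combination (assumed log-probability inputs and stand-in shapes; not the RAG module itself):

import tensorflow as tf

smooth_epsilon = 0.1
log_probs = tf.nn.log_softmax(tf.random.normal((6, 50)), axis=-1)  # (tokens, vocab)
labels = tf.constant([3, 7, 1, 0, 4, 2])

# NLL summed over tokens, equivalent to SparseCategoricalCrossentropy with SUM reduction
nll_loss = -tf.reduce_sum(tf.gather(log_probs, labels, batch_dims=1))
smooth_loss = -tf.reduce_sum(log_probs)       # total smoothing term over all classes
eps_i = smooth_epsilon / log_probs.shape[-1]  # ε / V
loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss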
@@ -417,12 +417,12 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
             input_ids = prepared_for_class.pop(input_name)
             loss = model(input_ids, **prepared_for_class)[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
             # Test that model correctly compute the loss with a dict
             prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
             loss = model(prepared_for_class)[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
             # Test that model correctly compute the loss with a tuple
             prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)

@@ -453,7 +453,7 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
             # Send to model
             loss = model(tuple_input[:-1])[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
 
 @require_tf
@@ -1294,7 +1294,7 @@ class TFModelTesterMixin:
             model_input = prepared_for_class.pop(input_name)
             loss = model(model_input, **prepared_for_class)[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
             # Test that model correctly compute the loss when we mask some positions
             prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)

@@ -1307,13 +1307,13 @@ class TFModelTesterMixin:
                 labels[0] = -100
                 prepared_for_class["labels"] = tf.convert_to_tensor(labels)
                 loss = model(model_input, **prepared_for_class)[0]
-                self.assertEqual(loss.shape.as_list(), expected_loss_size)
+                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
                 self.assertTrue(not np.any(np.isnan(loss.numpy())))
 
             # Test that model correctly compute the loss with a dict
             prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
             loss = model(prepared_for_class)[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
             # Test that model correctly compute the loss with a tuple
             prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)

@@ -1344,7 +1344,7 @@ class TFModelTesterMixin:
             # Send to model
             loss = model(tuple_input[:-1])[0]
-            self.assertEqual(loss.shape.as_list(), expected_loss_size)
+            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
 
     def test_keras_fit(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
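
The test changes all follow one pattern: the old exact shape assertion is relaxed so that either the legacy per-sample shape or the new (1,) shape passes. Reduced to its essentials (stand-in values only):

import tensorflow as tf

expected_loss_size = [2]   # e.g. one loss per sample in a batch of 2
loss = tf.constant([1.7])  # new-style loss of shape (1,)
assert loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]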