Unverified commit 49c0b293 authored by Nilesh, committed by GitHub

Fixed nll with label_smoothing to just nll (#28708)

* Fixed nll with label_smoothing to nll

* Resolved conflict by rebase

* Fixed nll with label_smoothing to nll

* Resolved conflict by rebase

* Added label_smoothing to config file

* Fixed nits
parent 4f09d0fd
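
For context: with `label_smoothing=0.0`, `torch.nn.CrossEntropyLoss` reduces to the plain NLL of the log-softmax, which is what this change restores as the default. A minimal standalone sketch (the shapes are illustrative, not taken from the model):

```python
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

logits = torch.randn(4, 10)               # (batch, vocab), illustrative shapes
targets = torch.randint(0, 10, (4,))

# With label_smoothing=0.0 (the new default), cross-entropy is plain NLL.
ce = CrossEntropyLoss(label_smoothing=0.0)
nll = F.nll_loss(F.log_softmax(logits, dim=-1), targets)
assert torch.allclose(ce(logits, targets), nll)
```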
@@ -94,6 +94,10 @@ class BlipTextConfig(PretrainedConfig):
             Whether the model is used as a decoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
+        label_smoothing (`float`, *optional*, defaults to 0.0):
+            A float in [0.0, 1.0] specifying the amount of smoothing when computing the loss, where 0.0 means no
+            smoothing. The targets become a mixture of the original ground truth and a uniform distribution, as
+            described in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
     Example:
@@ -133,6 +137,7 @@ class BlipTextConfig(PretrainedConfig):
         sep_token_id=102,
         is_decoder=True,
         use_cache=True,
+        label_smoothing=0.0,
         **kwargs,
     ):
         super().__init__(
@@ -158,6 +163,7 @@ class BlipTextConfig(PretrainedConfig):
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
         self.is_decoder = is_decoder
         self.use_cache = use_cache
+        self.label_smoothing = label_smoothing

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -298,6 +304,10 @@ class BlipConfig(PretrainedConfig):
             The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
         image_text_hidden_size (`int`, *optional*, defaults to 256):
             Dimensionality of the hidden state of the image-text fusion layer.
+        label_smoothing (`float`, *optional*, defaults to 0.0):
+            A float in [0.0, 1.0] specifying the amount of smoothing when computing the loss, where 0.0 means no
+            smoothing. The targets become a mixture of the original ground truth and a uniform distribution, as
+            described in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
         kwargs (*optional*):
             Dictionary of keyword arguments.
@@ -333,6 +343,7 @@ class BlipConfig(PretrainedConfig):
         projection_dim=512,
         logit_scale_init_value=2.6592,
         image_text_hidden_size=256,
+        label_smoothing=0.0,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -355,6 +366,7 @@ class BlipConfig(PretrainedConfig):
         self.initializer_factor = 1.0
         self.initializer_range = 0.02
         self.image_text_hidden_size = image_text_hidden_size
+        self.label_smoothing = label_smoothing

     @classmethod
     def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
...
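
The new config attribute makes smoothing opt-in rather than hard-coded. A minimal usage sketch, assuming a `transformers` build that includes this change:

```python
from transformers import BlipConfig, BlipTextConfig

# The default keeps plain NLL (no smoothing).
text_config = BlipTextConfig()
assert text_config.label_smoothing == 0.0

# Opting back into the previously hard-coded value:
config = BlipConfig(label_smoothing=0.1)
```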
@@ -813,6 +813,7 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel):
         self.bert = BlipTextModel(config, add_pooling_layer=False)
         self.cls = BlipTextOnlyMLMHead(config)
+        self.label_smoothing = config.label_smoothing

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -893,7 +894,7 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel):
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
-            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
             lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
             if reduction == "none":
                 lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
...
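
The shifted next-token loss above, extracted as a standalone sketch; `vocab_size` and the tensor shapes are illustrative, and the literal `label_smoothing=0.0` stands in for `self.label_smoothing`:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch, seq_len = 30524, 2, 7   # illustrative values
prediction_scores = torch.randn(batch, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq_len))

# Next-token prediction: drop the last logit and the first label.
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous()

# label_smoothing now comes from the config (default 0.0, i.e. plain NLL).
loss_fct = CrossEntropyLoss(reduction="mean", label_smoothing=0.0)
lm_loss = loss_fct(shifted_prediction_scores.view(-1, vocab_size), labels.view(-1))
```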
@@ -976,6 +976,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
         self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
         self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
+        self.label_smoothing = config.label_smoothing

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -1063,7 +1064,9 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
             # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
             # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
             one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
-            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
+            loss_fct = keras.losses.CategoricalCrossentropy(
+                from_logits=True, label_smoothing=self.label_smoothing, reduction="none"
+            )
             masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
             lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
             lm_loss *= masked_positions
...
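
The Keras path has to de-sparsify first because `keras.losses.SparseCategoricalCrossentropy` has no `label_smoothing` argument; only the dense (one-hot) `CategoricalCrossentropy` does. A standalone sketch with illustrative shapes:

```python
import tensorflow as tf
from tensorflow import keras

vocab_size = 30524                         # illustrative
labels = tf.constant([[5, 17, -100]])      # -100 marks positions to ignore
shifted_prediction_scores = tf.random.normal((1, 3, vocab_size))

# relu clamps the -100 ignore index to 0; those positions are masked out below.
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.0, reduction="none"
)
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) * masked_positions
```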