Unverified Commit 54659048, authored by amyeroberts and committed by GitHub

Early labels validation (#31240)

* Move label validation checks - fail early

* Remove some formatting changes - add back the labels change for wav2vec2
parent 03ea1609
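
The change is mechanical across every file touched below: each `labels` sanity check moves from the loss branch at the bottom of `forward` up to the top, so invalid labels raise before the expensive model body runs. A minimal sketch of the pattern, using a hypothetical toy module rather than any of the transformers classes in this diff:

```python
from typing import Optional

import torch
from torch import nn


class ToyCTCHead(nn.Module):
    """Hypothetical stand-in for a *ForCTC model; only the validation pattern matters."""

    def __init__(self, vocab_size: int = 32):
        super().__init__()
        self.vocab_size = vocab_size
        self.encoder = nn.Linear(16, vocab_size)  # stand-in for the expensive backbone

    def forward(self, input_values: torch.Tensor, labels: Optional[torch.Tensor] = None):
        # Fail early: validate `labels` before any compute, mirroring this PR.
        if labels is not None and labels.max() >= self.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.vocab_size}")

        logits = self.encoder(input_values)  # previously the error surfaced only after this

        loss = None
        if labels is not None:
            # cross_entropy expects (batch, classes, ...), so move vocab to dim 1
            loss = nn.functional.cross_entropy(logits.transpose(1, 2), labels)
        return loss, logits
```
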
@@ -763,6 +763,12 @@ class BarkCausalModel(BarkPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError(
+                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
+            )
+
         # Verify if input_embeds already exists
         # then compute embeddings.
         if input_ids is not None and input_embeds is not None:
@@ -870,12 +876,6 @@ class BarkCausalModel(BarkPreTrainedModel):
         logits = self.lm_head(hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError(
-                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
-            )
-
         if not return_dict:
             return tuple(
                 v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None
@@ -1393,6 +1393,10 @@ class BarkFineModel(BarkPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         if codebook_idx == 0:
             raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")
@@ -1470,10 +1474,6 @@ class BarkFineModel(BarkPreTrainedModel):
         logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)
...
@@ -1247,6 +1247,9 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.beit(
             pixel_values,
             head_mask=head_mask,
@@ -1279,9 +1282,6 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
...
@@ -1372,9 +1372,11 @@ class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.data2vec_audio(
             input_values,
             attention_mask=attention_mask,
@@ -1390,9 +1392,6 @@ class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel):
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
...
@@ -1173,6 +1173,9 @@ class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.data2vec_vision(
             pixel_values,
             head_mask=head_mask,
@@ -1205,9 +1208,6 @@ class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
...
@@ -1633,6 +1633,9 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.data2vec_vision(
             pixel_values,
             head_mask=head_mask,
@@ -1672,9 +1675,6 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
...
@@ -732,6 +732,8 @@ class MCTCTForCTC(MCTCTPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         outputs = self.mctct(
@@ -749,9 +751,6 @@ class MCTCTForCTC(MCTCTPreTrainedModel):
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask
...
@@ -1440,9 +1440,13 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
         >>> outputs = model(**inputs)
         >>> logits = outputs.logits
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and relevance_score is None:
+            raise ValueError(
+                "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
+            )
+
         (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
             input_ids, attention_mask, token_type_ids
         )
@@ -1468,11 +1472,6 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
-            if candidate_score is None:
-                raise ValueError(
-                    "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
-                )
-
             batch_size, seq_length = labels.size()
 
             if mlm_mask is None:
...
@@ -424,6 +424,10 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
         >>> formatted = (output * 255 / np.max(output)).astype("uint8")
         >>> depth = Image.fromarray(formatted)
         ```"""
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -444,10 +448,6 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
         predicted_depth = self.head(hidden_states, patch_height, patch_width)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             if output_hidden_states:
                 output = (predicted_depth,) + outputs[1:]
...
@@ -1136,6 +1136,10 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
         >>> formatted = (output * 255 / np.max(output)).astype("uint8")
         >>> depth = Image.fromarray(formatted)
         ```"""
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1183,10 +1187,6 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
         predicted_depth = self.head(hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             if output_hidden_states:
                 output = (predicted_depth,) + outputs[1:]
@@ -1308,6 +1308,9 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.dpt(
             pixel_values,
             head_mask=head_mask,
@@ -1342,9 +1345,6 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
...
@@ -921,6 +921,8 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassific
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
+        if labels is not None and self.config.pad_token_id is None and input_ids.shape[0] != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
 
         transformer_outputs = self.transformer(
             input_ids=input_ids,
@@ -963,9 +965,6 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassific
         loss = None
         if labels is not None:
-            if self.config.pad_token_id is None and logits_shape[0] != 1:
-                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
-
             if not tf.is_tensor(sequence_lengths):
                 in_logits = logits[0 : logits_shape[0], sequence_lengths]
...
@@ -1574,9 +1574,11 @@ class HubertForCTC(HubertPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.hubert(
             input_values,
             attention_mask=attention_mask,
@@ -1592,9 +1594,6 @@ class HubertForCTC(HubertPreTrainedModel):
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
...
@@ -1600,6 +1600,8 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
         >>> loss = model(input_values, labels=labels).loss
         ```"""
+        if labels is not None and tf.reduce_max(labels) >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
 
         outputs = self.hubert(
             input_values=input_values,
@@ -1619,9 +1621,6 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
         logits = self.lm_head(hidden_states)
 
         if labels is not None:
-            if tf.reduce_max(labels) >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             attention_mask = (
                 attention_mask if attention_mask is not None else tf.ones_like(input_values, dtype=tf.float32)
             )
...
@@ -822,6 +822,9 @@ class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilenet_v2(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -834,9 +837,6 @@ class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
...
@@ -1026,6 +1026,9 @@ class MobileViTForSemanticSegmentation(MobileViTPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevit(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1038,9 +1041,6 @@ class MobileViTForSemanticSegmentation(MobileViTPreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
...
@@ -1323,6 +1323,9 @@ class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and not self.config.num_labels > 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevit(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1336,9 +1339,6 @@ class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel):
         loss = None
         if labels is not None:
-            if not self.config.num_labels > 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.hf_compute_loss(logits=logits, labels=labels)
+            loss = self.hf_compute_loss(logits=logits, labels=labels)
 
         # make logits of shape (batch_size, num_labels, height, width) to
...
@@ -990,6 +990,9 @@ class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevitv2(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1002,9 +1005,6 @@ class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
...
@@ -1740,6 +1740,10 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Optical flow training is not yet supported")
+
         outputs = self.perceiver(
             inputs=inputs,
             attention_mask=attention_mask,
@@ -1750,10 +1754,6 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
         )
         logits = outputs.logits if return_dict else outputs[0]
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Optical flow training is not yet supported")
-
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -1974,6 +1974,10 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Multimodal autoencoding training is not yet supported")
+
         outputs = self.perceiver(
             inputs=inputs,
             attention_mask=attention_mask,
@@ -1985,10 +1989,6 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         )
         logits = outputs.logits if return_dict else outputs[0]
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Multimodal autoencoding training is not yet supported")
-
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -784,6 +784,9 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels < 1:
+            raise ValueError(f"Number of labels should be >=0: {self.config.num_labels}")
+
         outputs = self.segformer(
             pixel_values,
             output_attentions=output_attentions,
@@ -809,8 +812,6 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
                 loss_fct = BCEWithLogitsLoss(reduction="none")
                 loss = loss_fct(upsampled_logits.squeeze(1), labels.float())
                 loss = (loss * valid_mask).mean()
-            else:
-                raise ValueError(f"Number of labels should be >=0: {self.config.num_labels}")
 
         if not return_dict:
             if output_hidden_states:
...
@@ -988,6 +988,9 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and not self.config.num_labels > 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.segformer(
             pixel_values,
             output_attentions=output_attentions,
@@ -1001,9 +1004,6 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
         loss = None
         if labels is not None:
-            if not self.config.num_labels > 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.hf_compute_loss(logits=logits, labels=labels)
+            loss = self.hf_compute_loss(logits=logits, labels=labels)
 
         # make logits of shape (batch_size, num_labels, height, width) to
...
@@ -1418,9 +1418,11 @@ class SEWForCTC(SEWPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.sew(
             input_values,
             attention_mask=attention_mask,
@@ -1436,9 +1438,6 @@ class SEWForCTC(SEWPreTrainedModel):
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
...
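
For completeness, a hypothetical pytest sketch (not part of this PR) of what the hoisted checks enable: invalid labels now raise immediately, without paying for a full forward pass. The tiny `vocab_size` and the input shapes below are assumptions for illustration.

```python
import pytest
import torch

from transformers import HubertConfig, HubertForCTC


def test_labels_validated_before_forward():
    # Randomly initialized model; only vocab_size matters for this check.
    model = HubertForCTC(HubertConfig(vocab_size=32))
    bad_labels = torch.full((1, 4), 99, dtype=torch.long)  # 99 >= vocab_size
    with pytest.raises(ValueError, match="vocab_size"):
        # After this PR, the error is raised before self.hubert(...) runs.
        model(torch.randn(1, 4000), labels=bad_labels)
```
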