Fix loss calculation in TFXXXForTokenClassification models (#15294)

* Fix loss calculation in TFFunnelForTokenClassification * revert the change in TFFunnelForTokenClassification * fix FunnelForTokenClassification loss * fix other TokenClassification loss * fix more * fix more * add num_labels to ElectraForTokenClassification * revert the change to research projects Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>

Fix loss calculation in TFXXXForTokenClassification models (#15294)
* Fix loss calculation in TFFunnelForTokenClassification * revert the change in TFFunnelForTokenClassification * fix FunnelForTokenClassification loss * fix other TokenClassification loss * fix more * fix more * add num_labels to ElectraForTokenClassification * revert the change to research projects Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
554d333e · Yih-Dar · GitHub · 44c7857b · 554d333e · 554d333e
Unverified Commit 554d333e authored Jan 31, 2022 by Yih-Dar Committed by GitHub Jan 31, 2022
7 changed files
--- a/src/transformers/models/rembert/modeling_rembert.py
+++ b/src/transformers/models/rembert/modeling_rembert.py
@@ -1413,15 +1413,6 @@ class RemBertForTokenClassification(RemBertPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -1414,15 +1414,6 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/src/transformers/models/roformer/modeling_roformer.py
+++ b/src/transformers/models/roformer/modeling_roformer.py
@@ -1472,15 +1472,6 @@ class RoFormerForTokenClassification(RoFormerPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/src/transformers/models/squeezebert/modeling_squeezebert.py
+++ b/src/transformers/models/squeezebert/modeling_squeezebert.py
@@ -984,15 +984,6 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -1169,15 +1169,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -1680,15 +1680,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:

--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -1455,15 +1455,6 @@ class {{cookiecutter.camelcase_modelname}}ForTokenClassification({{cookiecutter.
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict: