Unverified Commit f8bd8c6c authored by François Lagunas's avatar François Lagunas Committed by GitHub
Browse files

Fixes bug that appears when using QA BERT and distillation. (#12026)

* Fixing bug that appears when using distillation (and potentially other uses).
During backward pass Pytorch complains with:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
This happens because the QA model code modifies the start_positions and end_positions input tensors in place, using the clamp_ function: as a consequence the teacher and the student both modify the inputs, and the backward pass fails.

* Fixing all models QA clamp_ bug.
parent 59f75d53
...@@ -1554,8 +1554,8 @@ class RoFormerForQuestionAnswering(RoFormerPreTrainedModel): ...@@ -1554,8 +1554,8 @@ class RoFormerForQuestionAnswering(RoFormerPreTrainedModel):
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
......
...@@ -1080,8 +1080,8 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): ...@@ -1080,8 +1080,8 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
......
...@@ -953,8 +953,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -953,8 +953,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
......
...@@ -1874,8 +1874,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1874,8 +1874,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
......
...@@ -1516,8 +1516,8 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca ...@@ -1516,8 +1516,8 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
...@@ -3066,8 +3066,8 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca ...@@ -3066,8 +3066,8 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca
end_positions = end_positions.squeeze(-1) end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms # sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1) ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index) start_positions = start_positions.clamp(0, ignored_index)
end_positions.clamp_(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment