"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7d5ce6802ec5bab29d60e3501337d3477f31b866"
Unverified commit 15641892, authored by Kaustubh, committed by GitHub

feat(model parallelism): moving the labels to the same device as the logits for gpt2 and bart (#22591)
parent e577bd0f
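
For context: when a model is split across devices for model parallelism, the final logits live on the last device of the pipeline while user-supplied labels typically arrive on the first one, so the loss computation fails with a device-mismatch error unless the labels are moved first. Below is a minimal sketch of the failure mode and of the one-line fix this commit applies, assuming a machine with two CUDA devices; the toy tensors stand in for a sharded model's outputs and are not part of the diff.

import torch
from torch.nn import CrossEntropyLoss

# Stand-ins for a pipeline-parallel model: the lm head runs on cuda:1,
# so its logits land there, while the labels were prepared on cuda:0.
lm_logits = torch.randn(4, 10, device="cuda:1", requires_grad=True)
labels = torch.randint(0, 10, (4,), device="cuda:0")

loss_fct = CrossEntropyLoss()
# loss_fct(lm_logits, labels) at this point would raise:
# RuntimeError: Expected all tensors to be on the same device

labels = labels.to(lm_logits.device)  # the one-line change this commit adds
loss = loss_fct(lm_logits, labels)
loss.backward()

The diff below applies exactly this move, right before each loss computation, across the affected model classes.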
@@ -1398,6 +1398,7 @@ class BartForConditionalGeneration(BartPretrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            labels = labels.to(lm_logits.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

@@ -1553,6 +1554,7 @@ class BartForSequenceClassification(BartPretrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.config.num_labels == 1:
                     self.config.problem_type = "regression"

@@ -1896,6 +1898,7 @@ class BartForCausalLM(BartPretrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -2581,6 +2581,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            labels = labels.to(lm_logits.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

@@ -2735,6 +2736,7 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.config.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1596,6 +1596,7 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1563,6 +1563,7 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1098,6 +1098,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()

@@ -1318,6 +1320,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
         lm_loss = None
         if labels is not None:
+            labels = labels.to(lm_logits.device)
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss()

@@ -1569,6 +1572,7 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1715,6 +1715,7 @@ class MarianForCausalLM(MarianPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1528,6 +1528,7 @@ class MBartForSequenceClassification(MBartPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.config.num_labels == 1:
                     self.config.problem_type = "regression"

@@ -1866,6 +1867,7 @@ class MBartForCausalLM(MBartPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1694,6 +1694,7 @@ class PegasusForCausalLM(PegasusPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1499,6 +1499,7 @@ class PLBartForSequenceClassification(PLBartPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.config.num_labels == 1:
                     self.config.problem_type = "regression"

@@ -1713,6 +1714,7 @@ class PLBartForCausalLM(PLBartPreTrainedModel):
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
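
As a usage sketch of what the change enables (a hypothetical snippet, not from the commit, assuming accelerate is installed so device_map="auto" can shard the model across the available GPUs):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# With accelerate installed, device_map="auto" may place different blocks
# on different devices, so the lm head's logits need not share a device
# with the inputs.
model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")

batch = tokenizer("Hello, my dog is cute", return_tensors="pt").to(model.device)
# The labels no longer have to be moved to the logits' device by hand;
# the patched forward() calls labels.to(lm_logits.device) internally.
outputs = model(**batch, labels=batch["input_ids"])
print(outputs.loss.item())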