Unverified commit 95e70575, authored by Rinat, committed by GitHub

Make vilt, switch_transformers compatible with model parallelism (#22703)

* Update modeling_vilt.py

Make ViLT compatible with model parallelism.

* Update modeling_switch_transformers.py

Make SwitchTransformers compatible with model parallelism.
parent 89087597
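All five hunks below apply the same fix: once a model is sharded across devices, the logits produced by the final head can live on a different GPU than the labels the caller passed in, and computing the loss across devices raises a runtime error. Moving the labels to the logits' device immediately before the loss call resolves this. A minimal standalone sketch of the pattern (illustrative only; the helper name and signature are invented for this example and are not part of the Transformers API):

import torch
from torch.nn import CrossEntropyLoss

def lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper showing the device-alignment pattern from this commit."""
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    # move labels to correct device to enable PP: under pipeline/model
    # parallelism lm_logits may sit on e.g. cuda:1 while labels arrived
    # on cuda:0, and CrossEntropyLoss requires both on the same device
    labels = labels.to(lm_logits.device)
    return loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))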
modeling_switch_transformers.py

@@ -1700,6 +1700,8 @@ class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel):
                 decoder_router_probs = nn.Softmax(dim=-1)(decoder_router_logits)
                 decoder_aux_loss = load_balancing_loss_func(decoder_router_probs, decoder_expert_indexes)
+            # move labels to correct device to enable PP
+            labels = labels.to(lm_logits.device)
             loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
         if output_router_logits and labels is not None:
modeling_vilt.py

@@ -1009,6 +1009,8 @@ class ViltForMaskedLM(ViltPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            # move labels to correct device to enable PP
+            labels = labels.to(mlm_logits.device)
             masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))
         if not return_dict:
@@ -1155,6 +1157,8 @@ class ViltForQuestionAnswering(ViltPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable PP
+            labels = labels.to(logits.device)
             loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]
             # see https://github.com/jnhwkim/ban-vqa/blob/master/train.py#L19
@@ -1395,6 +1399,8 @@ class ViltForImagesAndTextClassification(ViltPreTrainedModel):
         loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
+            # move labels to correct device to enable PP
+            labels = labels.to(logits.device)
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         if not return_dict:
@@ -1481,6 +1487,8 @@ class ViltForTokenClassification(ViltPreTrainedModel):
         loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
+            # move labels to correct device to enable PP
+            labels = labels.to(logits.device)
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         if not return_dict:
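For context, a hedged usage sketch of what these fixes unblock: sharding one of the affected models across GPUs with device_map="auto" (which requires the accelerate package) and computing a loss inside the forward pass. The checkpoint name is illustrative and the snippet assumes a multi-GPU machine:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Illustrative checkpoint; any SwitchTransformers checkpoint behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto")

inputs = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt")
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids

# Before this commit, this forward pass could fail with a device-mismatch
# error whenever lm_logits landed on a different GPU than the labels.
outputs = model(input_ids=inputs.input_ids, labels=labels)
print(outputs.loss)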