Unverified commit 656e869a, authored by Arun Brahma and committed by GitHub


moved labels to the same device as logits for BLOOM, GPT Neo, GPT NeoX, RoBERTa and VIT models (#22663)

moved labels to the same device as logits
parent 6db23af5
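
When a model is sharded across devices (for example with Accelerate's `device_map="auto"`), the final logits can live on a different GPU than the `labels` the caller passed in, and `CrossEntropyLoss` then fails with a device-mismatch `RuntimeError`. Every hunk below applies the same one-line fix: move `labels` to `logits.device` before computing the loss. A minimal, self-contained sketch of the failure mode and the fix (the toy two-layer model and device names are hypothetical, for illustration only; pass `"cpu"` for both devices to run it anywhere):

```python
import torch
from torch import nn

# Hypothetical toy model split across two devices, mimicking what a
# sharded placement does to a large transformer.
class TwoDeviceLM(nn.Module):
    def __init__(self, vocab_size=100, hidden=32, d0="cuda:0", d1="cuda:1"):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden).to(d0)
        self.lm_head = nn.Linear(hidden, vocab_size).to(d1)
        self.d0, self.d1 = d0, d1

    def forward(self, input_ids, labels=None):
        hidden = self.embed(input_ids.to(self.d0))
        logits = self.lm_head(hidden.to(self.d1))  # logits end up on d1
        loss = None
        if labels is not None:
            # Without this line, labels (often still on d0 or CPU) and
            # logits (d1) disagree and CrossEntropyLoss raises a
            # RuntimeError. This is the one-liner the commit adds.
            labels = labels.to(logits.device)
            loss = nn.CrossEntropyLoss()(
                logits.view(-1, logits.size(-1)), labels.view(-1)
            )
        return loss, logits
```
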
@@ -927,6 +927,8 @@ class BloomForCausalLM(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
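
The causal-LM hunks here and below all share the same shift-and-flatten loss, and the device move has to come first because the shifted `labels` view feeds the loss directly. A runnable sketch of the pattern (tensor shapes are made up for illustration):

```python
import torch
from torch.nn import CrossEntropyLoss

batch, seq, vocab = 2, 8, 100
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(0, vocab, (batch, seq))

# move labels to correct device to enable model parallelism
labels = labels.to(lm_logits.device)
# Shift so that tokens < n predict n: the logits at position i are
# scored against the token at position i + 1.
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
```
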
@@ -1194,6 +1196,8 @@ class BloomForTokenClassification(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             batch_size, seq_length = labels.shape
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(
@@ -1015,6 +1015,8 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1097,6 +1099,8 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1210,6 +1214,8 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1297,6 +1303,8 @@ class CamembertForTokenClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1534,6 +1542,8 @@ class CamembertForCausalLM(CamembertPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -757,6 +757,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Compute loss in fp32 to match with mesh-tf version
             # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
             lm_logits = lm_logits.to(torch.float32)
@@ -677,6 +677,8 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shift_logits = lm_logits[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -993,6 +993,8 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1113,6 +1115,8 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1225,6 +1229,8 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1335,6 +1341,8 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1421,6 +1429,8 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1000,6 +1000,8 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1124,6 +1126,8 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1236,6 +1240,8 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1349,6 +1355,8 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1434,6 +1442,8 @@ class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -809,6 +809,8 @@ class ViTForImageClassification(ViTPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
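
The sequence- and image-classification hunks all truncate right as the `problem_type` dispatch begins. For reference, a sketch of the full dispatch those heads share, with this commit's device move folded in (the function name and signature are mine, not the library's):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def classification_loss(logits, labels, num_labels, problem_type=None):
    # Infer the problem type the same way the classification heads do.
    if problem_type is None:
        if num_labels == 1:
            problem_type = "regression"
        elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
            problem_type = "single_label_classification"
        else:
            problem_type = "multi_label_classification"

    labels = labels.to(logits.device)  # the fix this commit applies
    if problem_type == "regression":
        return MSELoss()(logits.squeeze(), labels.squeeze())
    if problem_type == "single_label_classification":
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
    return BCEWithLogitsLoss()(logits, labels)  # multi-label: float targets
```
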
@@ -702,6 +702,8 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -997,6 +997,8 @@ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1121,6 +1123,8 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1235,6 +1239,8 @@ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1348,6 +1354,8 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1435,6 +1443,8 @@ class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
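
Taken together, these one-liners make the models' built-in loss computation robust to sharded placements. A usage sketch, assuming a multi-GPU machine with `accelerate` installed (the checkpoint choice is illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Shard a BLOOM checkpoint across the available devices.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m", device_map="auto"
)

inputs = tokenizer("Hello, world", return_tensors="pt")
# labels start on CPU here; after this commit the forward pass moves
# them to the logits' device before computing the loss.
outputs = model(input_ids=inputs.input_ids, labels=inputs.input_ids)
print(outputs.loss)
```
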