Unverified Commit 656e869a authored by Arun Brahma, committed by GitHub


moved labels to the same device as logits for BLOOM, GPT Neo, GPT NeoX, RoBERTa and ViT models (#22663)

moved labels to the same device as logits
parent 6db23af5
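
The change is the same one-liner in every file below: just before the loss is computed, the labels are moved to whichever device holds the logits. This matters once a model is sharded across devices (e.g. with accelerate's device_map="auto"): the logits come out on the last shard while user-supplied labels may still sit on another device, and CrossEntropyLoss refuses to mix devices. A minimal sketch of the failure and the fix, assuming two visible CUDA devices (shapes and device names are illustrative, not from the diff):

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size = 100
    logits = torch.randn(2, 5, vocab_size, device="cuda:1")         # produced on the last shard
    labels = torch.randint(0, vocab_size, (2, 5), device="cuda:0")  # left where the caller put them

    # Without the move, the loss raises a RuntimeError along the lines of
    # "Expected all tensors to be on the same device, but found at least two devices".
    labels = labels.to(logits.device)  # the one-line fix applied in each hunk below
    loss = CrossEntropyLoss()(logits.view(-1, vocab_size), labels.view(-1))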
@@ -927,6 +927,8 @@ class BloomForCausalLM(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
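
The surrounding context lines show the standard causal-LM shift ("tokens < n predict n"): the logits at position i are scored against the token at position i + 1, so the last logit and the first label are dropped. A toy illustration of the same slicing (shapes and values assumed):

    import torch
    from torch.nn import CrossEntropyLoss

    lm_logits = torch.randn(1, 4, 10)      # (batch, seq_len, vocab)
    labels = torch.tensor([[3, 7, 2, 9]])  # usually a copy of the input ids

    shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions for positions 0..2
    shift_labels = labels[..., 1:].contiguous()         # targets are tokens 1..3
    loss = CrossEntropyLoss()(shift_logits.view(-1, 10), shift_labels.view(-1))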
@@ -1194,6 +1196,8 @@ class BloomForTokenClassification(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             batch_size, seq_length = labels.shape
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(
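
The token-classification head uses a plain per-token cross entropy: logits of shape (batch_size, seq_length, num_labels) are flattened against flattened labels, which is why the hunk reads labels.shape first. A shape-only sketch (values assumed):

    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_length, num_labels = 2, 6, 5
    logits = torch.randn(batch_size, seq_length, num_labels)
    labels = torch.randint(0, num_labels, (batch_size, seq_length))

    loss = CrossEntropyLoss()(
        logits.view(batch_size * seq_length, num_labels),
        labels.view(batch_size * seq_length),
    )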
@@ -1015,6 +1015,8 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
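
The masked-LM heads flatten the same way, with one detail worth remembering: positions that were not masked are conventionally labeled -100, which CrossEntropyLoss ignores by default, so only masked tokens contribute to the loss. Sketch (values assumed):

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size = 30
    prediction_scores = torch.randn(1, 4, vocab_size)
    labels = torch.tensor([[-100, 12, -100, 7]])  # -100 = unmasked position, ignored

    masked_lm_loss = CrossEntropyLoss()(
        prediction_scores.view(-1, vocab_size), labels.view(-1)
    )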
@@ -1097,6 +1099,8 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
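
The problem_type block that starts here is shared by every sequence- and image-classification head in this diff: when config.problem_type is unset it is inferred from num_labels and the labels' dtype, then dispatched to MSE, cross entropy, or BCE-with-logits. A condensed sketch of that dispatch as a standalone function (not the actual method):

    import torch
    from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

    def classification_loss(logits, labels, num_labels, problem_type=None):
        if problem_type is None:  # inferred the same way the heads in this diff do
            if num_labels == 1:
                problem_type = "regression"
            elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
                problem_type = "single_label_classification"
            else:
                problem_type = "multi_label_classification"
        if problem_type == "regression":
            return MSELoss()(logits.squeeze(), labels.squeeze())
        if problem_type == "single_label_classification":
            return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
        return BCEWithLogitsLoss()(logits, labels)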
@@ -1210,6 +1214,8 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
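
Multiple-choice heads encode each (question, candidate) pair separately, then reshape the per-choice scores back to (batch, num_choices) so cross entropy can pick the index of the correct candidate; that is what reshaped_logits holds. A shape-only sketch (values assumed):

    import torch
    from torch.nn import CrossEntropyLoss

    batch, num_choices = 2, 4
    flat_scores = torch.randn(batch * num_choices, 1)       # one score per encoded choice
    reshaped_logits = flat_scores.view(batch, num_choices)  # rows = one question's candidates
    labels = torch.tensor([1, 3])                           # index of the correct choice
    loss = CrossEntropyLoss()(reshaped_logits, labels)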
@@ -1297,6 +1303,8 @@ class CamembertForTokenClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1534,6 +1542,8 @@ class CamembertForCausalLM(CamembertPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -757,6 +757,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Compute loss in fp32 to match with mesh-tf version
             # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
             lm_logits = lm_logits.to(torch.float32)
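
GPT Neo adds one wrinkle the other models here don't have: the logits are upcast to float32 before the loss to match the original mesh-tensorflow implementation, since accumulating cross entropy in half precision can lose accuracy; the full file then casts the loss back to the hidden states' dtype. Dtype handling only, sketched with assumed shapes:

    import torch
    from torch.nn import CrossEntropyLoss

    lm_logits = torch.randn(1, 8, 100, dtype=torch.float16)
    labels = torch.randint(0, 100, (1, 8))

    lm_logits = lm_logits.to(torch.float32)  # compute the loss in full precision
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss = CrossEntropyLoss()(shift_logits.view(-1, 100), shift_labels.view(-1))
    loss = loss.to(torch.float16)  # cast back (the real code uses hidden_states.dtype)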
@@ -677,6 +677,8 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shift_logits = lm_logits[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -993,6 +993,8 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1113,6 +1115,8 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1225,6 +1229,8 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1335,6 +1341,8 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1421,6 +1429,8 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1000,6 +1000,8 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1124,6 +1126,8 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1236,6 +1240,8 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1349,6 +1355,8 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1434,6 +1442,8 @@ class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -809,6 +809,8 @@ class ViTForImageClassification(ViTPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -702,6 +702,8 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -997,6 +997,8 @@ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1121,6 +1123,8 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1235,6 +1239,8 @@ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1348,6 +1354,8 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1435,6 +1443,8 @@ class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
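
With every head patched the same way, labels no longer need to be moved by hand when a checkpoint is sharded. A hypothetical smoke test (checkpoint name is real; the scenario assumes accelerate is installed and at least one GPU is visible):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
    model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto")

    inputs = tok("Hello world", return_tensors="pt")   # tensors start on CPU
    out = model(**inputs, labels=inputs["input_ids"])  # forward() moves the labels itself
    print(out.loss.item())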