"examples/pytorch/test_pytorch_examples.py" did not exist on "1381b6d01dca9c84c3cacd3eae5155cda8e03c18"
Unverified Commit a9cdb059 authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Fix device issue in `OpenLlamaModelTest::test_model_parallelism` (#24195)



fix
Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
parent 9f81f4f6
...@@ -736,12 +736,16 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): ...@@ -736,12 +736,16 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
hidden_states = outputs[0] hidden_states = outputs[0]
if self.config.shared_input_output_embedding: if self.config.shared_input_output_embedding:
logits = torch.einsum("blh,vh->blv", hidden_states, self.model.embed_tokens.weight) logits = torch.einsum(
"blh,vh->blv", hidden_states.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight
)
else: else:
logits = self.lm_head(hidden_states) logits = self.lm_head(hidden_states)
loss = None loss = None
if labels is not None: if labels is not None:
# move labels to correct device to enable model parallelism
labels = labels.to(logits.device)
# Shift so that tokens < n predict n # Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous() shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous() shift_labels = labels[..., 1:].contiguous()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment