Unverified Commit c6646613 authored by Yih-Dar, committed by GitHub

Fix `get_embedding` dtype at init. time (#19473)



* cast positions dtype in XGLMModel

* Get the correct dtype at init time

* Get the correct dtype at init time
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent e38cf93e
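
For context, every hunk below makes the same one-line change: the sinusoidal table that `get_embedding` builds in float32 is cast to `torch.get_default_dtype()` before being returned, so the positional weights match the dtype the model is being initialized under. A minimal, self-contained sketch of the shared pattern (class name simplified; not the exact transformers source):

```python
# Minimal sketch (assumed/simplified names; not the exact transformers code) of the
# pattern shared by the five patched classes.
import math
from typing import Optional

import torch
from torch import nn


class SinusoidalPositionalEmbedding(nn.Module):
    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """Build a sinusoidal position table; `emb` is created in float32."""
        half_dim = embedding_dim // 2
        scale = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -scale)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero-pad the last column when embedding_dim is odd
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        # The fix: cast to the dtype active at init time, so a model initialized
        # under e.g. torch.set_default_dtype(torch.float16) gets float16 positions.
        return emb.to(torch.get_default_dtype())
```
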
@@ -152,7 +152,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
         if padding_idx is not None:
             emb[padding_idx, :] = 0
-        return emb
+        return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
     def forward(
@@ -165,7 +165,7 @@ class Speech2TextSinusoidalPositionalEmbedding(nn.Module):
             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
         if padding_idx is not None:
             emb[padding_idx, :] = 0
-        return emb
+        return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
     def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
@@ -111,7 +111,7 @@ class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
         if padding_idx is not None:
             emb[padding_idx, :] = 0
-        return emb
+        return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
     def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
@@ -126,7 +126,7 @@ class TrOCRSinusoidalPositionalEmbedding(nn.Module):
             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
         if padding_idx is not None:
             emb[padding_idx, :] = 0
-        return emb
+        return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
     def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
@@ -194,7 +194,7 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module):
             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
         if padding_idx is not None:
             emb[padding_idx, :] = 0
-        return emb
+        return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
     def forward(
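
A quick usage sketch (assuming the simplified class above) of why the cast matters: when a model is built while a half-precision default dtype is active, as transformers arranges temporarily when `torch_dtype=torch.float16` is passed, the table now comes out in float16 instead of float32:

```python
# Assumes the SinusoidalPositionalEmbedding sketch above.
torch.set_default_dtype(torch.float16)  # what transformers sets around init for torch_dtype=float16
table = SinusoidalPositionalEmbedding.get_embedding(num_embeddings=512, embedding_dim=64, padding_idx=1)
print(table.dtype)  # torch.float16 with this fix; previously torch.float32
torch.set_default_dtype(torch.float32)  # restore the usual default
```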