"tests/models/plbart/test_modeling_plbart.py" did not exist on "e983da0e7d91c100e6e35efcb8a69c8cd41d6e09"
Unverified commit 6ba63ac3, authored by Matt and committed via GitHub

[InternLM] Add support for InternLM (#26302)

* Add config.bias to LLaMA to allow InternLM models to be ported as LLaMA checkpoints

* Rename bias -> attention_bias and add docstring
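
With this change, an InternLM-style checkpoint (whose attention projections carry bias terms) can be described by the stock Llama configuration. A minimal sketch, using illustrative sizes rather than a real InternLM checkpoint:

    from transformers import LlamaConfig, LlamaForCausalLM

    # Hypothetical small sizes for illustration; a ported checkpoint supplies its own.
    config = LlamaConfig(
        hidden_size=256,
        intermediate_size=512,
        num_hidden_layers=2,
        num_attention_heads=4,
        attention_bias=True,  # new flag added by this commit; the default stays False
    )
    model = LlamaForCausalLM(config)  # q/k/v/o projections are created with bias terms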
parent 0ac38750
@@ -87,6 +87,8 @@ class LlamaConfig(PretrainedConfig):
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
             experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
 
     Example:
@@ -125,6 +127,7 @@ class LlamaConfig(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
+        attention_bias=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -147,6 +150,7 @@ class LlamaConfig(PretrainedConfig):
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
+        self.attention_bias = attention_bias
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -280,11 +280,10 @@ class LlamaAttention(nn.Module):
                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                 f" and `num_heads`: {self.num_heads})."
             )
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
         self._init_rope()
 
     def _init_rope(self):
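
A quick, illustrative check (not part of the diff) that the flag reaches the projection layers:

    from transformers import LlamaConfig, LlamaForCausalLM

    # Tiny hypothetical config so the model builds quickly.
    cfg = LlamaConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=1, num_attention_heads=4)
    print(LlamaForCausalLM(cfg).model.layers[0].self_attn.q_proj.bias)        # None: attention_bias defaults to False

    cfg.attention_bias = True
    print(LlamaForCausalLM(cfg).model.layers[0].self_attn.q_proj.bias.shape)  # torch.Size([64]): bias is now allocated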