Unverified Commit a64f8c1f authored by Jing Hua, committed by GitHub

[docstring] fix incorrect llama docstring: encoder -> decoder (#27071)

fix incorrect docstring: encoder -> decoder
parent 0baa9246
@@ -47,9 +47,9 @@ class LlamaConfig(PretrainedConfig):
         intermediate_size (`int`, *optional*, defaults to 11008):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
+            Number of hidden layers in the Transformer decoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
+            Number of attention heads for each attention layer in the Transformer decoder.
         num_key_value_heads (`int`, *optional*):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
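
The Grouped Query Attention relationship described in the `num_key_value_heads` docstring can be illustrated with a short sketch. The default values mirror the snippet above; how the config is instantiated here is an assumption for illustration, not part of the commit:

    from transformers import LlamaConfig

    # Sketch only: a Llama-7B-style decoder config using the defaults quoted above.
    mha_config = LlamaConfig(
        intermediate_size=11008,
        num_hidden_layers=32,    # hidden layers in the Transformer decoder
        num_attention_heads=32,  # attention heads per decoder layer
        num_key_value_heads=32,  # equal to num_attention_heads -> Multi Head Attention (MHA)
    )

    # Grouped Query Attention (GQA): fewer key/value heads than query heads,
    # so several query heads share one key/value head.
    gqa_config = LlamaConfig(
        num_attention_heads=32,
        num_key_value_heads=8,   # 32 / 8 = 4 query heads share each key/value head
    )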
@@ -871,7 +871,7 @@ LLAMA_INPUTS_DOCSTRING = r"""
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
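
For the `past_key_values` shape described in that docstring, a minimal sketch of what one self-attention cache tuple looks like for a decoder-only model follows. The tensor sizes are illustrative assumptions, not values from the source:

    import torch

    # Illustrative sizes only (hypothetical).
    batch_size, num_heads, sequence_length, embed_size_per_head = 1, 32, 10, 128
    n_layers = 32

    # One (key, value) pair per decoder layer, each of shape
    # (batch_size, num_heads, sequence_length, embed_size_per_head).
    past_key_values = tuple(
        (
            torch.zeros(batch_size, num_heads, sequence_length, embed_size_per_head),
            torch.zeros(batch_size, num_heads, sequence_length, embed_size_per_head),
        )
        for _ in range(n_layers)
    )

    assert len(past_key_values) == n_layers
    assert past_key_values[0][0].shape == (batch_size, num_heads, sequence_length, embed_size_per_head)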