Unverified Commit e3a4bd2b authored by Shijie Wu, committed by GitHub

add custom RMSNorm to `ALL_LAYERNORM_LAYERS` (#26227)

* add LlamaRMSNorm to ALL_LAYERNORM_LAYERS

* fixup

* add IdeficsRMSNorm to ALL_LAYERNORM_LAYERS and fixup
parent 0b5024ce
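
Background: `ALL_LAYERNORM_LAYERS` is the list most notably consulted by `Trainer` when it builds optimizer parameter groups, so any norm class appended to it has its weights excluded from weight decay just like `nn.LayerNorm`. A minimal sketch of that grouping logic, mirroring what `Trainer.create_optimizer` does (the helper name `split_decay_parameters` is illustrative, not part of the library):

```python
import torch.nn as nn

from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names


def split_decay_parameters(model: nn.Module):
    # Parameters living inside any layer type listed in ALL_LAYERNORM_LAYERS
    # (and all biases) are excluded from weight decay; everything else gets it.
    decay_names = get_parameter_names(model, ALL_LAYERNORM_LAYERS)
    decay_names = [name for name in decay_names if "bias" not in name]
    decay = [p for n, p in model.named_parameters() if n in decay_names]
    no_decay = [p for n, p in model.named_parameters() if n not in decay_names]
    return decay, no_decay
```

Without this commit, `IdeficsRMSNorm`/`LlamaRMSNorm` weights would land in the decay group, because the classes are not subclasses of `nn.LayerNorm`.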
modeling_idefics.py
@@ -31,6 +31,7 @@ from ... import PreTrainedModel
 from ...activations import ACT2FN
 from ...modeling_outputs import ModelOutput
 from ...modeling_utils import PretrainedConfig
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS
 from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -261,7 +262,7 @@ def freeze_model(model, module_exceptions=[]):
     }
     module_exceptions_mapped = [mapping[m] for m in module_exceptions]
     for module in model.modules():
-        if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
+        if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
             module.requires_grad_(True)  # Explicitely setting it to true to avoid any mistakes
         else:
             module.requires_grad_(False)
@@ -496,6 +497,9 @@ class IdeficsRMSNorm(nn.Module):
         return self.weight * hidden_states
 
 
+ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
+
+
 # this was adapted from LlamaRotaryEmbedding
 class IdeficsEmbedding(torch.nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
...
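
For reference, `IdeficsRMSNorm` (like `LlamaRMSNorm` below) implements the standard RMSNorm computation: scale each hidden vector by the reciprocal root mean square of its components, then apply a learned per-channel weight. A self-contained sketch of that pattern, not the exact source:

```python
import torch
import torch.nn as nn


class MinimalRMSNorm(nn.Module):
    """Sketch of the RMSNorm pattern used by IdeficsRMSNorm/LlamaRMSNorm."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # learned per-channel scale
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Root-mean-square normalization over the last dimension (no mean subtraction).
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states
```

Because such a norm is a plain `nn.Module` rather than an `nn.LayerNorm` subclass, it is invisible to `isinstance`-based checks, hence the explicit `ALL_LAYERNORM_LAYERS.append(...)` above.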
modeling_llama.py
@@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_llama import LlamaConfig
@@ -89,6 +90,9 @@ class LlamaRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
+ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
+
+
 class LlamaRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
...
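
A quick check of the effect once a `transformers` build containing this commit is installed (the toy `nn.Sequential` model is only for illustration):

```python
import torch.nn as nn

from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names

# The custom norm is now registered alongside nn.LayerNorm.
assert LlamaRMSNorm in ALL_LAYERNORM_LAYERS

# Toy model: a linear layer followed by the RMSNorm.
model = nn.Sequential(nn.Linear(16, 16), LlamaRMSNorm(16))

# Trainer-style decay grouping now skips the norm's parameters.
print(get_parameter_names(model, ALL_LAYERNORM_LAYERS))
# expected: ['0.weight', '0.bias'] -- the norm's '1.weight' is not listed
```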