Unverified Commit e3a4bd2b authored by Shijie Wu, committed by GitHub

add custom RMSNorm to `ALL_LAYERNORM_LAYERS` (#26227)

* add LlamaRMSNorm to ALL_LAYERNORM_LAYERS

* fixup

* add IdeficsRMSNorm to ALL_LAYERNORM_LAYERS and fixup
parent 0b5024ce
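
Background: `ALL_LAYERNORM_LAYERS` is the list most notably consulted by `Trainer` when it builds optimizer parameter groups, so any norm class appended to it has its weights excluded from weight decay just like `nn.LayerNorm`. A minimal sketch of that grouping logic, mirroring what `Trainer.create_optimizer` does (the helper name `split_decay_parameters` is illustrative, not part of the library):

```python
import torch.nn as nn

from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names


def split_decay_parameters(model: nn.Module):
    # Parameters living inside any layer type listed in ALL_LAYERNORM_LAYERS
    # (and all biases) are excluded from weight decay; everything else gets it.
    decay_names = get_parameter_names(model, ALL_LAYERNORM_LAYERS)
    decay_names = [name for name in decay_names if "bias" not in name]
    decay = [p for n, p in model.named_parameters() if n in decay_names]
    no_decay = [p for n, p in model.named_parameters() if n not in decay_names]
    return decay, no_decay
```

Without this commit, `IdeficsRMSNorm`/`LlamaRMSNorm` weights would land in the decay group, because the classes are not subclasses of `nn.LayerNorm`.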
modeling_idefics.py
@@ -31,6 +31,7 @@ from ... import PreTrainedModel
 from ...activations import ACT2FN
 from ...modeling_outputs import ModelOutput
 from ...modeling_utils import PretrainedConfig
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS
 from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -261,7 +262,7 @@ def freeze_model(model, module_exceptions=[]):
     }
     module_exceptions_mapped = [mapping[m] for m in module_exceptions]
     for module in model.modules():
-        if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
+        if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
             module.requires_grad_(True)  # Explicitely setting it to true to avoid any mistakes
         else:
             module.requires_grad_(False)
@@ -496,6 +497,9 @@ class IdeficsRMSNorm(nn.Module):
         return self.weight * hidden_states
 
 
+ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
+
+
 # this was adapted from LlamaRotaryEmbedding
 class IdeficsEmbedding(torch.nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
...
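
For reference, `IdeficsRMSNorm` (like `LlamaRMSNorm` below) implements the standard RMSNorm computation: scale each hidden vector by the reciprocal root mean square of its components, then apply a learned per-channel weight. A self-contained sketch of that pattern, not the exact source:

```python
import torch
import torch.nn as nn


class MinimalRMSNorm(nn.Module):
    """Sketch of the RMSNorm pattern used by IdeficsRMSNorm/LlamaRMSNorm."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # learned per-channel scale
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Root-mean-square normalization over the last dimension (no mean subtraction).
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states
```

Because such a norm is a plain `nn.Module` rather than an `nn.LayerNorm` subclass, it is invisible to `isinstance`-based checks, hence the explicit `ALL_LAYERNORM_LAYERS.append(...)` above.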
modeling_llama.py
@@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_llama import LlamaConfig
@@ -89,6 +90,9 @@ class LlamaRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
+ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
+
+
 class LlamaRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
...
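
A quick check of the effect once a `transformers` build containing this commit is installed (the toy `nn.Sequential` model is only for illustration):

```python
import torch.nn as nn

from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names

# The custom norm is now registered alongside nn.LayerNorm.
assert LlamaRMSNorm in ALL_LAYERNORM_LAYERS

# Toy model: a linear layer followed by the RMSNorm.
model = nn.Sequential(nn.Linear(16, 16), LlamaRMSNorm(16))

# Trainer-style decay grouping now skips the norm's parameters.
print(get_parameter_names(model, ALL_LAYERNORM_LAYERS))
# expected: ['0.weight', '0.bias'] -- the norm's '1.weight' is not listed
```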