Commit 2db90e2c authored by zhuwenwen
Browse files

Update MoE models for LM_NN

parent 31330101
......@@ -580,15 +580,15 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states,
)
if hidden_states.dtype == torch.float16:
# Fix FP16 overflow
# We scale both hidden_states and residual before
# rmsnorm; the rmsnorm result is not affected by the scale.
hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0:
# The residual is shared by all layers, so we only scale it on
# the first layer.
residual *= 1. / self.routed_scaling_factor
# if hidden_states.dtype == torch.float16:
# # Fix FP16 overflow
# # We scale both hidden_states and residual before
# # rmsnorm; the rmsnorm result is not affected by the scale.
# hidden_states *= 1. / self.routed_scaling_factor
# if self.layer_idx == 0:
# # The residual is shared by all layers, so we only scale it on
# # the first layer.
# residual *= 1. / self.routed_scaling_factor
# Fully Connected
hidden_states, residual = self.post_attention_layernorm(
......
......@@ -427,6 +427,7 @@ class MixtralModel(nn.Module):
for layername in loaded_params:
weight = params_dict[layername]
os.environ['LM_NN'] = '0'
matches = re.findall(combined_words, layername)
......
......@@ -511,6 +511,7 @@ class Qwen2MoeModel(nn.Module):
for layername in loaded_params:
weight = params_dict[layername]
os.environ['LM_NN'] = '0'
# if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
# weight.data = pad_weight(weight.data, 32)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment