Commit 2db90e2c authored by zhuwenwen
Browse files

update moe models of LM_NN

parent 31330101
...@@ -580,15 +580,15 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -580,15 +580,15 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states, hidden_states=hidden_states,
) )
if hidden_states.dtype == torch.float16: # if hidden_states.dtype == torch.float16:
# Fix FP16 overflow # # Fix FP16 overflow
# We scale both hidden_states and residual before # # We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale. # # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor # hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0: # if self.layer_idx == 0:
# The residual is shared by all layers, we only scale it on # # The residual is shared by all layers, we only scale it on
# first layer. # # first layer.
residual *= 1. / self.routed_scaling_factor # residual *= 1. / self.routed_scaling_factor
# Fully Connected # Fully Connected
hidden_states, residual = self.post_attention_layernorm( hidden_states, residual = self.post_attention_layernorm(
......
...@@ -427,6 +427,7 @@ class MixtralModel(nn.Module): ...@@ -427,6 +427,7 @@ class MixtralModel(nn.Module):
for layername in loaded_params: for layername in loaded_params:
weight = params_dict[layername] weight = params_dict[layername]
os.environ['LM_NN'] = '0'
matches = re.findall(combined_words, layername) matches = re.findall(combined_words, layername)
......
...@@ -511,6 +511,7 @@ class Qwen2MoeModel(nn.Module): ...@@ -511,6 +511,7 @@ class Qwen2MoeModel(nn.Module):
for layername in loaded_params: for layername in loaded_params:
weight = params_dict[layername] weight = params_dict[layername]
os.environ['LM_NN'] = '0'
# if self.use_fa_pad and (re.findall(qkv_bias_words, layername)): # if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
# weight.data = pad_weight(weight.data, 32) # weight.data = pad_weight(weight.data, 32)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment