Unverified commit 48291484, authored by ℍ𝕠𝕝𝕝𝕠𝕨 𝕄𝕒𝕟, committed by GitHub
Browse files

[BugFix] LoRA: Support loading base_layer of experts (#31104)


Signed-off-by: Hollow Man <hollowman@opensuse.org>
parent efeaac92
...@@ -626,6 +626,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -626,6 +626,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -512,6 +512,7 @@ class MiMoV2Model(nn.Module): ...@@ -512,6 +512,7 @@ class MiMoV2Model(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -392,6 +392,7 @@ class MiniMaxM2Model(nn.Module): ...@@ -392,6 +392,7 @@ class MiniMaxM2Model(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -366,6 +366,7 @@ class MixtralModel(nn.Module): ...@@ -366,6 +366,7 @@ class MixtralModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -1084,6 +1084,7 @@ class Llama4ForConditionalGeneration( ...@@ -1084,6 +1084,7 @@ class Llama4ForConditionalGeneration(
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -636,6 +636,7 @@ class NemotronHModel(nn.Module): ...@@ -636,6 +636,7 @@ class NemotronHModel(nn.Module):
# what the activation is applied to # what the activation is applied to
# - FusedMoe.w3 (aka up_proj) should be ignored since we're # - FusedMoe.w3 (aka up_proj) should be ignored since we're
# using non-gated MoE # using non-gated MoE
self,
ckpt_gate_proj_name="up_proj", ckpt_gate_proj_name="up_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="", ckpt_up_proj_name="",
......
...@@ -338,6 +338,7 @@ class OlmoeModel(nn.Module): ...@@ -338,6 +338,7 @@ class OlmoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -1161,6 +1161,7 @@ class OpenPanguModel(nn.Module): ...@@ -1161,6 +1161,7 @@ class OpenPanguModel(nn.Module):
has_experts = hasattr(self.config, "n_routed_experts") has_experts = hasattr(self.config, "n_routed_experts")
if has_experts: if has_experts:
expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping( expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -149,6 +149,7 @@ class OpenPanguMTP(nn.Module, SupportsPP): ...@@ -149,6 +149,7 @@ class OpenPanguMTP(nn.Module, SupportsPP):
] ]
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -516,6 +516,7 @@ class PhiMoEModel(nn.Module): ...@@ -516,6 +516,7 @@ class PhiMoEModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -423,6 +423,7 @@ class Qwen2MoeModel(nn.Module): ...@@ -423,6 +423,7 @@ class Qwen2MoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -470,6 +470,7 @@ class Qwen3MoeModel(nn.Module): ...@@ -470,6 +470,7 @@ class Qwen3MoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -1031,6 +1031,7 @@ class Qwen3NextModel(nn.Module): ...@@ -1031,6 +1031,7 @@ class Qwen3NextModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -147,6 +147,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module): ...@@ -147,6 +147,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -165,6 +165,7 @@ class MoEMixin(MixtureOfExperts): ...@@ -165,6 +165,7 @@ class MoEMixin(MixtureOfExperts):
for gate_proj, down_proj, up_proj in ckpt_names: for gate_proj, down_proj, up_proj in ckpt_names:
expert_mapping.extend( expert_mapping.extend(
FusedMoE.make_expert_params_mapping( FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name=gate_proj, ckpt_gate_proj_name=gate_proj,
ckpt_down_proj_name=down_proj, ckpt_down_proj_name=down_proj,
ckpt_up_proj_name=up_proj, ckpt_up_proj_name=up_proj,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment