Unverified commit 48291484, authored by ℍ𝕠𝕝𝕝𝕠𝕨 𝕄𝕒𝕟 and committed by GitHub
Browse files

[BugFix] LoRA: Support loading base_layer of experts (#31104)


Signed-off-by: Hollow Man <hollowman@opensuse.org>
parent efeaac92
...@@ -2007,6 +2007,7 @@ class FusedMoE(CustomOp): ...@@ -2007,6 +2007,7 @@ class FusedMoE(CustomOp):
@classmethod @classmethod
def make_expert_params_mapping( def make_expert_params_mapping(
cls, cls,
model: torch.nn.Module,
ckpt_gate_proj_name: str, ckpt_gate_proj_name: str,
ckpt_down_proj_name: str, ckpt_down_proj_name: str,
ckpt_up_proj_name: str, ckpt_up_proj_name: str,
...@@ -2025,13 +2026,19 @@ class FusedMoE(CustomOp): ...@@ -2025,13 +2026,19 @@ class FusedMoE(CustomOp):
) )
) )
base_layer = (
"base_layer."
if any(".base_layer." in name for name, _ in model.named_parameters())
else ""
)
return [ return [
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
( (
"experts.w13_" f"experts.{base_layer}w13_"
if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
else "experts.w2_", else f"experts.{base_layer}w2_",
f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.", f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}",
expert_id, expert_id,
shard_id, shard_id,
) )
......
...@@ -475,6 +475,7 @@ class AfmoeModel(nn.Module): ...@@ -475,6 +475,7 @@ class AfmoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -476,6 +476,7 @@ class BailingMoeModel(nn.Module): ...@@ -476,6 +476,7 @@ class BailingMoeModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -106,6 +106,7 @@ class DeepseekV2Model(nn.Module): ...@@ -106,6 +106,7 @@ class DeepseekV2Model(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -245,6 +245,7 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts): ...@@ -245,6 +245,7 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
] ]
expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -1486,6 +1486,7 @@ class DeepseekV2ForCausalLM( ...@@ -1486,6 +1486,7 @@ class DeepseekV2ForCausalLM(
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
...@@ -1519,6 +1520,7 @@ class DeepseekV2ForCausalLM( ...@@ -1519,6 +1520,7 @@ class DeepseekV2ForCausalLM(
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -424,6 +424,7 @@ class Dots1Model(nn.Module): ...@@ -424,6 +424,7 @@ class Dots1Model(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -497,6 +497,7 @@ class Ernie4_5_MoeModel(nn.Module): ...@@ -497,6 +497,7 @@ class Ernie4_5_MoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -675,6 +675,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): ...@@ -675,6 +675,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -496,6 +496,7 @@ class Glm4MoeModel(nn.Module): ...@@ -496,6 +496,7 @@ class Glm4MoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -248,6 +248,7 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -248,6 +248,7 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -729,6 +729,7 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA): ...@@ -729,6 +729,7 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
# Params for weights, weight scales, activation scales # Params for weights, weight scales, activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -353,6 +353,7 @@ class GraniteMoeModel(nn.Module): ...@@ -353,6 +353,7 @@ class GraniteMoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -369,6 +369,7 @@ class Grok1Model(nn.Module): ...@@ -369,6 +369,7 @@ class Grok1Model(nn.Module):
# Grok1 uses "num_experts" in its config # Grok1 uses "num_experts" in its config
num_experts = getattr(self.config, "num_experts", 8) num_experts = getattr(self.config, "num_experts", 8)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="linear", # Grok1 specific ckpt_gate_proj_name="linear", # Grok1 specific
ckpt_down_proj_name="linear_1", # Grok1 specific ckpt_down_proj_name="linear_1", # Grok1 specific
ckpt_up_proj_name="linear_v", # Grok1 specific ckpt_up_proj_name="linear_v", # Grok1 specific
......
...@@ -706,6 +706,7 @@ class HunYuanModel(nn.Module): ...@@ -706,6 +706,7 @@ class HunYuanModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping( return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -378,6 +378,7 @@ class JambaModel(nn.Module): ...@@ -378,6 +378,7 @@ class JambaModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -560,6 +560,7 @@ class KimiLinearForCausalLM( ...@@ -560,6 +560,7 @@ class KimiLinearForCausalLM(
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -462,6 +462,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -462,6 +462,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -486,6 +486,7 @@ class Lfm2MoeModel(nn.Module): ...@@ -486,6 +486,7 @@ class Lfm2MoeModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return FusedMoE.make_expert_params_mapping( return FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -539,6 +539,7 @@ class Llama4Model(LlamaModel): ...@@ -539,6 +539,7 @@ class Llama4Model(LlamaModel):
# Expert parameter mapping for the case where the expert weights are # Expert parameter mapping for the case where the expert weights are
# not fused into a single weight tensor. # not fused into a single weight tensor.
expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
...@@ -548,6 +549,7 @@ class Llama4Model(LlamaModel): ...@@ -548,6 +549,7 @@ class Llama4Model(LlamaModel):
# Expert parameter mapping for the case where the expert weights are # Expert parameter mapping for the case where the expert weights are
# fused into a single weight tensor. # fused into a single weight tensor.
expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping( expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_up_proj", ckpt_gate_proj_name="gate_up_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="gate_up_proj", ckpt_up_proj_name="gate_up_proj",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment