Unverified commit 1c2c1eb8, authored by bnellnm, committed by GitHub
Browse files

[MoE Refactor] Rename FusedMoE.make_expert_params_mapping to...


[MoE Refactor] Rename FusedMoE.make_expert_params_mapping to fused_moe_make_expert_params_mapping (#40671)
Signed-off-by: Bill Nell <bnell@redhat.com>
parent 8824f50f
...@@ -32,7 +32,10 @@ from transformers import PretrainedConfig ...@@ -32,7 +32,10 @@ from transformers import PretrainedConfig
from vllm._aiter_ops import rocm_aiter_ops from vllm._aiter_ops import rocm_aiter_ops
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
...@@ -260,7 +263,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -260,7 +263,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
] ]
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -31,7 +31,10 @@ import torch.nn as nn ...@@ -31,7 +31,10 @@ import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.config import CacheConfig, ParallelConfig, VllmConfig
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
...@@ -247,7 +250,7 @@ class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts): ...@@ -247,7 +250,7 @@ class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -20,7 +20,10 @@ from vllm.distributed import ( ...@@ -20,7 +20,10 @@ from vllm.distributed import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
) )
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -331,7 +334,7 @@ class GptOssModel(nn.Module, EagleModelMixin): ...@@ -331,7 +334,7 @@ class GptOssModel(nn.Module, EagleModelMixin):
# Params for weights, weight scales, activation scales # Params for weights, weight scales, activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
# NOTE: this is only used for quark. # NOTE: this is only used for quark.
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -39,7 +39,10 @@ from vllm.distributed import ( ...@@ -39,7 +39,10 @@ from vllm.distributed import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
) )
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -351,7 +354,7 @@ class GraniteMoeModel(nn.Module): ...@@ -351,7 +354,7 @@ class GraniteMoeModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -38,7 +38,10 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size ...@@ -38,7 +38,10 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.activation import GeluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -519,7 +522,7 @@ class Grok1Model(nn.Module): ...@@ -519,7 +522,7 @@ class Grok1Model(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Map expert parameter names to standard names # Map expert parameter names to standard names
num_experts = _get_num_experts(self.config) num_experts = _get_num_experts(self.config)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name=self.ckpt_gate_proj_name, ckpt_gate_proj_name=self.ckpt_gate_proj_name,
ckpt_down_proj_name=self.ckpt_down_proj_name, ckpt_down_proj_name=self.ckpt_down_proj_name,
......
...@@ -42,7 +42,10 @@ from vllm.distributed import ( ...@@ -42,7 +42,10 @@ from vllm.distributed import (
) )
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
...@@ -712,7 +715,7 @@ class HunYuanModel(nn.Module, EagleModelMixin): ...@@ -712,7 +715,7 @@ class HunYuanModel(nn.Module, EagleModelMixin):
if _is_moe(self.config): if _is_moe(self.config):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -41,7 +41,9 @@ from vllm.distributed import ( ...@@ -41,7 +41,9 @@ from vllm.distributed import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
......
...@@ -14,7 +14,10 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig ...@@ -14,7 +14,10 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import get_pp_group from vllm.distributed.parallel_state import get_pp_group
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -378,7 +381,7 @@ class JambaModel(nn.Module): ...@@ -378,7 +381,7 @@ class JambaModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -14,7 +14,10 @@ from vllm.distributed import ( ...@@ -14,7 +14,10 @@ from vllm.distributed import (
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.kda import KimiDeltaAttention from vllm.model_executor.layers.kda import KimiDeltaAttention
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -476,7 +479,7 @@ class KimiLinearModel(nn.Module): ...@@ -476,7 +479,7 @@ class KimiLinearModel(nn.Module):
if self.config.is_moe: if self.config.is_moe:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -15,7 +15,10 @@ from vllm.distributed import ( ...@@ -15,7 +15,10 @@ from vllm.distributed import (
) )
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -482,7 +485,7 @@ class Lfm2MoeModel(nn.Module): ...@@ -482,7 +485,7 @@ class Lfm2MoeModel(nn.Module):
return hidden_states return hidden_states
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -36,7 +36,10 @@ from vllm.model_executor.layers.attention import ( ...@@ -36,7 +36,10 @@ from vllm.model_executor.layers.attention import (
Attention, Attention,
ChunkedLocalAttention, ChunkedLocalAttention,
) )
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -414,7 +417,7 @@ class Llama4Model(LlamaModel): ...@@ -414,7 +417,7 @@ class Llama4Model(LlamaModel):
params_dict: The dictionary of module parameters. params_dict: The dictionary of module parameters.
loaded_params: The set of already loaded parameters. loaded_params: The set of already loaded parameters.
expert_params_mapping: The mapping of expert parameters. Must be expert_params_mapping: The mapping of expert parameters. Must be
generated by FusedMoE.make_expert_params_mapping(). generated by fused_moe_make_expert_params_mapping().
fused: Whether the expert weights are fused into a single weight fused: Whether the expert weights are fused into a single weight
tensor or are separate weight tensors for each expert. tensor or are separate weight tensors for each expert.
When fused is True, loaded_weight should have shape of: When fused is True, loaded_weight should have shape of:
...@@ -554,7 +557,7 @@ class Llama4Model(LlamaModel): ...@@ -554,7 +557,7 @@ class Llama4Model(LlamaModel):
fused_experts_params = False fused_experts_params = False
# Expert parameter mapping for the case where the expert weights are # Expert parameter mapping for the case where the expert weights are
# not fused into a single weight tensor. # not fused into a single weight tensor.
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
...@@ -564,7 +567,7 @@ class Llama4Model(LlamaModel): ...@@ -564,7 +567,7 @@ class Llama4Model(LlamaModel):
) )
# Expert parameter mapping for the case where the expert weights are # Expert parameter mapping for the case where the expert weights are
# fused into a single weight tensor. # fused into a single weight tensor.
expert_params_mapping_fused = FusedMoE.make_expert_params_mapping( expert_params_mapping_fused = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_up_proj", ckpt_gate_proj_name="gate_up_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -46,7 +46,10 @@ from vllm.config import CacheConfig, VllmConfig ...@@ -46,7 +46,10 @@ from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group from vllm.distributed import get_pp_group
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -622,7 +625,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -622,7 +625,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -22,7 +22,10 @@ from vllm.distributed import ( ...@@ -22,7 +22,10 @@ from vllm.distributed import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -511,7 +514,7 @@ class MiMoV2Model(nn.Module): ...@@ -511,7 +514,7 @@ class MiMoV2Model(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -38,7 +38,10 @@ from vllm.distributed import ( ...@@ -38,7 +38,10 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -393,7 +396,7 @@ class MiniMaxM2Model(nn.Module, EagleModelMixin): ...@@ -393,7 +396,7 @@ class MiniMaxM2Model(nn.Module, EagleModelMixin):
return hidden_states return hidden_states
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -24,7 +24,9 @@ from vllm.distributed.parallel_state import ( ...@@ -24,7 +24,9 @@ from vllm.distributed.parallel_state import (
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
......
...@@ -40,7 +40,10 @@ from vllm.distributed import ( ...@@ -40,7 +40,10 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -364,7 +367,7 @@ class MixtralModel(nn.Module): ...@@ -364,7 +367,7 @@ class MixtralModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
......
...@@ -40,7 +40,9 @@ from vllm.config.multimodal import BaseDummyOptions ...@@ -40,7 +40,9 @@ from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
...@@ -1072,7 +1074,7 @@ class Llama4ForConditionalGeneration( ...@@ -1072,7 +1074,7 @@ class Llama4ForConditionalGeneration(
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe import ( ...@@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe import (
FusedMoE, FusedMoE,
GateLinear, GateLinear,
activation_without_mul, activation_without_mul,
fused_moe_make_expert_params_mapping,
) )
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -652,7 +653,7 @@ class NemotronHModel(nn.Module): ...@@ -652,7 +653,7 @@ class NemotronHModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
if self.has_moe: if self.has_moe:
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
# - FusedMoe.w1 (aka gate_proj) should be up_proj since that's # - FusedMoe.w1 (aka gate_proj) should be up_proj since that's
# what the activation is applied to # what the activation is applied to
# - FusedMoe.w3 (aka up_proj) should be ignored since we're # - FusedMoe.w3 (aka up_proj) should be ignored since we're
......
...@@ -11,7 +11,9 @@ import torch.nn as nn ...@@ -11,7 +11,9 @@ import torch.nn as nn
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.config.parallel import ParallelConfig from vllm.config.parallel import ParallelConfig
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
...@@ -399,7 +401,7 @@ class NemotronHMTP(nn.Module, SupportsPP): ...@@ -399,7 +401,7 @@ class NemotronHMTP(nn.Module, SupportsPP):
if getattr(self.config, "model_type", None) == "nemotron_h_puzzle": if getattr(self.config, "model_type", None) == "nemotron_h_puzzle":
num_experts = self.config.mtp_n_routed_experts num_experts = self.config.mtp_n_routed_experts
if num_experts is not None: if num_experts is not None:
expert_params_mapping = FusedMoE.make_expert_params_mapping( expert_params_mapping = fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="up_proj", ckpt_gate_proj_name="up_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
...@@ -32,7 +32,10 @@ from vllm.distributed import ( ...@@ -32,7 +32,10 @@ from vllm.distributed import (
from vllm.distributed.utils import split_tensor_along_last_dim from vllm.distributed.utils import split_tensor_along_last_dim
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import (
FusedMoE,
fused_moe_make_expert_params_mapping,
)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
QKVParallelLinear, QKVParallelLinear,
...@@ -336,7 +339,7 @@ class OlmoeModel(nn.Module): ...@@ -336,7 +339,7 @@ class OlmoeModel(nn.Module):
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
return FusedMoE.make_expert_params_mapping( return fused_moe_make_expert_params_mapping(
self, self,
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment