Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5e584ce9
Unverified
Commit
5e584ce9
authored
Apr 21, 2026
by
bnellnm
Committed by
GitHub
Apr 21, 2026
Browse files
[MoE Refactor] Remove SharedFusedMoE class (#35782)
Signed-off-by:
Bill Nell
<
bnell@redhat.com
>
parent
1842447c
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
62 additions
and
103 deletions
+62
-103
tests/kernels/moe/test_moe_layer.py
tests/kernels/moe/test_moe_layer.py
+3
-7
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
+10
-10
vllm/distributed/device_communicators/base_device_communicator.py
...tributed/device_communicators/base_device_communicator.py
+3
-10
vllm/distributed/elastic_ep/elastic_execute.py
vllm/distributed/elastic_ep/elastic_execute.py
+2
-4
vllm/lora/layers/fused_moe.py
vllm/lora/layers/fused_moe.py
+2
-2
vllm/model_executor/layers/fused_moe/__init__.py
vllm/model_executor/layers/fused_moe/__init__.py
+0
-2
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+0
-25
vllm/model_executor/models/AXK1.py
vllm/model_executor/models/AXK1.py
+4
-4
vllm/model_executor/models/afmoe.py
vllm/model_executor/models/afmoe.py
+5
-5
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-2
vllm/model_executor/models/bailing_moe.py
vllm/model_executor/models/bailing_moe.py
+3
-3
vllm/model_executor/models/bailing_moe_linear.py
vllm/model_executor/models/bailing_moe_linear.py
+3
-3
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+2
-2
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+4
-4
vllm/model_executor/models/dots1.py
vllm/model_executor/models/dots1.py
+3
-3
vllm/model_executor/models/ernie45_moe.py
vllm/model_executor/models/ernie45_moe.py
+4
-4
vllm/model_executor/models/ernie45_vl_moe.py
vllm/model_executor/models/ernie45_vl_moe.py
+4
-4
vllm/model_executor/models/exaone_moe.py
vllm/model_executor/models/exaone_moe.py
+1
-2
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe.py
+3
-3
vllm/model_executor/models/glm4_moe_lite.py
vllm/model_executor/models/glm4_moe_lite.py
+4
-4
No files found.
tests/kernels/moe/test_moe_layer.py
View file @
5e584ce9
...
@@ -37,7 +37,7 @@ from vllm.distributed.parallel_state import (
...
@@ -37,7 +37,7 @@ from vllm.distributed.parallel_state import (
get_eplb_group
,
get_eplb_group
,
)
)
from
vllm.forward_context
import
set_forward_context
from
vllm.forward_context
import
set_forward_context
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
,
SharedFusedMoE
,
fused_experts
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
,
fused_experts
from
vllm.model_executor.layers.fused_moe.activation
import
MoEActivation
from
vllm.model_executor.layers.fused_moe.activation
import
MoEActivation
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.router.router_factory
import
(
from
vllm.model_executor.layers.fused_moe.router.router_factory
import
(
...
@@ -858,11 +858,7 @@ def make_fused_moe_layer(
...
@@ -858,11 +858,7 @@ def make_fused_moe_layer(
quant_config
,
qw
=
make_quant_config
(
quantization
,
w1
,
w2
,
global_num_experts
)
quant_config
,
qw
=
make_quant_config
(
quantization
,
w1
,
w2
,
global_num_experts
)
kwargs
=
dict
()
kwargs
=
dict
()
if
shared_experts
is
None
:
kwargs
[
"shared_experts"
]
=
shared_experts
builder
=
FusedMoE
else
:
builder
=
SharedFusedMoE
kwargs
[
"shared_experts"
]
=
shared_experts
# Add gate and routed_input_transform if provided
# Add gate and routed_input_transform if provided
if
gate
is
not
None
:
if
gate
is
not
None
:
...
@@ -872,7 +868,7 @@ def make_fused_moe_layer(
...
@@ -872,7 +868,7 @@ def make_fused_moe_layer(
kwargs
[
"routed_input_transform"
]
=
routed_input_transform
kwargs
[
"routed_input_transform"
]
=
routed_input_transform
kwargs
[
"routed_output_transform"
]
=
routed_output_transform
kwargs
[
"routed_output_transform"
]
=
routed_output_transform
layer
=
builder
(
layer
=
FusedMoE
(
num_experts
=
global_num_experts
,
num_experts
=
global_num_experts
,
top_k
=
top_k
,
top_k
=
top_k
,
hidden_size
=
hidden_size
,
hidden_size
=
hidden_size
,
...
...
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
View file @
5e584ce9
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Tests for
Shared
FusedMoE with routed_input_transform.
Tests for FusedMoE with routed_input_transform.
Verifies that applying routed_input_transform inside
Shared
FusedMoE
Verifies that applying routed_input_transform inside FusedMoE
produces the same results as applying the transform manually outside.
produces the same results as applying the transform manually outside.
"""
"""
...
@@ -13,7 +13,7 @@ import torch.nn as nn
...
@@ -13,7 +13,7 @@ import torch.nn as nn
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.forward_context
import
set_forward_context
from
vllm.forward_context
import
set_forward_context
from
vllm.model_executor.layers.fused_moe
.shared_fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
,
set_random_seed
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
,
set_random_seed
...
@@ -133,9 +133,9 @@ def test_routed_input_transform_inside_vs_outside(
...
@@ -133,9 +133,9 @@ def test_routed_input_transform_inside_vs_outside(
workspace_init
,
workspace_init
,
monkeypatch
,
monkeypatch
,
):
):
"""Compare
Shared
FusedMoE with transform inside vs manually applying outside.
"""Compare FusedMoE with transform inside vs manually applying outside.
Method A (inside):
Shared
FusedMoE with routed_input_transform
Method A (inside): FusedMoE with routed_input_transform
Method B (outside): Manually transform, then
Shared
FusedMoE without transform
Method B (outside): Manually transform, then FusedMoE without transform
"""
"""
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
if
use_rocm_aiter
else
"0"
)
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
if
use_rocm_aiter
else
"0"
)
...
@@ -157,8 +157,8 @@ def test_routed_input_transform_inside_vs_outside(
...
@@ -157,8 +157,8 @@ def test_routed_input_transform_inside_vs_outside(
routed_transform
=
SimpleLinear
(
hidden_size
,
latent_size
,
dtype
)
routed_transform
=
SimpleLinear
(
hidden_size
,
latent_size
,
dtype
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
# Method A:
Shared
FusedMoE WITH routed_input_transform
# Method A: FusedMoE WITH routed_input_transform
moe_with_transform
=
Shared
FusedMoE
(
moe_with_transform
=
FusedMoE
(
shared_experts
=
shared_experts
,
shared_experts
=
shared_experts
,
routed_input_transform
=
routed_transform
,
routed_input_transform
=
routed_transform
,
num_experts
=
num_experts
,
num_experts
=
num_experts
,
...
@@ -173,9 +173,9 @@ def test_routed_input_transform_inside_vs_outside(
...
@@ -173,9 +173,9 @@ def test_routed_input_transform_inside_vs_outside(
prefix
=
"moe_with_transform"
,
prefix
=
"moe_with_transform"
,
)
)
# Method B:
Shared
FusedMoE WITHOUT routed_input_transform
# Method B: FusedMoE WITHOUT routed_input_transform
# Note: shared_experts=None because when transform is done outside,
# Note: shared_experts=None because when transform is done outside,
moe_without_transform
=
Shared
FusedMoE
(
moe_without_transform
=
FusedMoE
(
shared_experts
=
None
,
shared_experts
=
None
,
routed_input_transform
=
None
,
routed_input_transform
=
None
,
num_experts
=
num_experts
,
num_experts
=
num_experts
,
...
...
vllm/distributed/device_communicators/base_device_communicator.py
View file @
5e584ce9
...
@@ -7,6 +7,8 @@ import torch
...
@@ -7,6 +7,8 @@ import torch
import
torch.distributed
as
dist
import
torch.distributed
as
dist
from
torch.distributed
import
ProcessGroup
from
torch.distributed
import
ProcessGroup
from
vllm.utils
import
is_moe_layer
class
Cache
:
class
Cache
:
def
__init__
(
self
):
def
__init__
(
self
):
...
@@ -317,16 +319,7 @@ class DeviceCommunicatorBase:
...
@@ -317,16 +319,7 @@ class DeviceCommunicatorBase:
if
not
self
.
is_ep_communicator
:
if
not
self
.
is_ep_communicator
:
return
return
moe_modules
=
[
moe_modules
=
[
module
for
module
in
model
.
modules
()
if
is_moe_layer
(
module
)]
module
for
module
in
model
.
modules
()
# TODO(bnell): Should use isinstance but can't. Maybe search for
# presence of quant_method.maybe_init_modular_kernel?
if
(
module
.
__class__
.
__name__
==
"FusedMoE"
or
module
.
__class__
.
__name__
==
"SharedFusedMoE"
)
]
for
module
in
moe_modules
:
for
module
in
moe_modules
:
module
.
maybe_init_modular_kernel
()
module
.
maybe_init_modular_kernel
()
...
...
vllm/distributed/elastic_ep/elastic_execute.py
View file @
5e584ce9
...
@@ -38,6 +38,7 @@ from vllm.distributed.parallel_state import (
...
@@ -38,6 +38,7 @@ from vllm.distributed.parallel_state import (
from
vllm.distributed.stateless_coordinator
import
StatelessGroupCoordinator
from
vllm.distributed.stateless_coordinator
import
StatelessGroupCoordinator
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoEParallelConfig
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoEParallelConfig
from
vllm.utils
import
is_moe_layer
from
vllm.v1.engine
import
ReconfigureDistributedRequest
,
ReconfigureRankType
from
vllm.v1.engine
import
ReconfigureDistributedRequest
,
ReconfigureRankType
from
vllm.v1.worker.gpu_ubatch_wrapper
import
UBatchWrapper
from
vllm.v1.worker.gpu_ubatch_wrapper
import
UBatchWrapper
from
vllm.v1.worker.workspace
import
lock_workspace
,
unlock_workspace
from
vllm.v1.worker.workspace
import
lock_workspace
,
unlock_workspace
...
@@ -319,10 +320,7 @@ class ElasticEPScalingExecutor:
...
@@ -319,10 +320,7 @@ class ElasticEPScalingExecutor:
moe_modules
=
[
moe_modules
=
[
module
module
for
module
in
self
.
worker
.
model_runner
.
model
.
modules
()
for
module
in
self
.
worker
.
model_runner
.
model
.
modules
()
if
(
if
is_moe_layer
(
module
)
module
.
__class__
.
__name__
==
"FusedMoE"
or
module
.
__class__
.
__name__
==
"SharedFusedMoE"
)
]
]
num_local_experts
=
moe_modules
[
0
].
moe_config
.
num_local_experts
num_local_experts
=
moe_modules
[
0
].
moe_config
.
num_local_experts
assert
all
(
assert
all
(
...
...
vllm/lora/layers/fused_moe.py
View file @
5e584ce9
...
@@ -610,7 +610,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -610,7 +610,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
)
->
bool
:
)
->
bool
:
"""Returns True if the layer can be replaced by this LoRA layer."""
"""Returns True if the layer can be replaced by this LoRA layer."""
# source_layer is FusedMoE
or SharedFusedMoE
# source_layer is FusedMoE
return
isinstance
(
source_layer
,
FusedMoE
)
and
len
(
packed_modules_list
)
==
2
return
isinstance
(
source_layer
,
FusedMoE
)
and
len
(
packed_modules_list
)
==
2
...
@@ -772,5 +772,5 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
...
@@ -772,5 +772,5 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
model_config
:
PretrainedConfig
|
None
=
None
,
model_config
:
PretrainedConfig
|
None
=
None
,
)
->
bool
:
)
->
bool
:
"""Returns True if the layer can be replaced by this LoRA layer."""
"""Returns True if the layer can be replaced by this LoRA layer."""
# source_layer is FusedMoE
or SharedFusedMoE
# source_layer is FusedMoE
return
isinstance
(
source_layer
,
FusedMoE
)
and
len
(
packed_modules_list
)
==
1
return
isinstance
(
source_layer
,
FusedMoE
)
and
len
(
packed_modules_list
)
==
1
vllm/model_executor/layers/fused_moe/__init__.py
View file @
5e584ce9
...
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.fused_moe.router.fused_moe_router import (
...
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.fused_moe.router.fused_moe_router import (
FusedMoERouter
,
FusedMoERouter
,
)
)
from
vllm.model_executor.layers.fused_moe.router.gate_linear
import
GateLinear
from
vllm.model_executor.layers.fused_moe.router.gate_linear
import
GateLinear
from
vllm.model_executor.layers.fused_moe.shared_fused_moe
import
SharedFusedMoE
from
vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method
import
(
from
vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method
import
(
UnquantizedFusedMoEMethod
,
UnquantizedFusedMoEMethod
,
)
)
...
@@ -64,7 +63,6 @@ __all__ = [
...
@@ -64,7 +63,6 @@ __all__ = [
"FusedMoEPrepareAndFinalizeModular"
,
"FusedMoEPrepareAndFinalizeModular"
,
"GateLinear"
,
"GateLinear"
,
"RoutingMethodType"
,
"RoutingMethodType"
,
"SharedFusedMoE"
,
"activation_without_mul"
,
"activation_without_mul"
,
"apply_moe_activation"
,
"apply_moe_activation"
,
"override_config"
,
"override_config"
,
...
...
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
deleted
100644 → 0
View file @
1842447c
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
# TODO(bnell): Remove this entirely
class
SharedFusedMoE
(
FusedMoE
):
"""
A FusedMoE operation that also computes the results of shared experts.
If an all2all communicator is being used the shared expert computation
can be interleaved with the fused all2all dispatch communication step.
"""
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
return
super
().
forward
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
)
vllm/model_executor/models/AXK1.py
View file @
5e584ce9
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -163,7 +163,7 @@ class AXK1MoE(nn.Module):
...
@@ -163,7 +163,7 @@ class AXK1MoE(nn.Module):
prefix
=
f
"
{
prefix
}
.shared_experts"
,
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
)
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
gate
=
self
.
gate
,
gate
=
self
.
gate
,
num_experts
=
config
.
n_routed_experts
,
num_experts
=
config
.
n_routed_experts
,
...
@@ -916,7 +916,7 @@ class AXK1ForCausalLM(
...
@@ -916,7 +916,7 @@ class AXK1ForCausalLM(
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -950,7 +950,7 @@ class AXK1ForCausalLM(
...
@@ -950,7 +950,7 @@ class AXK1ForCausalLM(
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping
=
Shared
FusedMoE
.
make_expert_params_mapping
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/afmoe.py
View file @
5e584ce9
...
@@ -18,7 +18,7 @@ from vllm.distributed import (
...
@@ -18,7 +18,7 @@ from vllm.distributed import (
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
.shared_fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -124,8 +124,8 @@ class AfmoeMoE(nn.Module):
...
@@ -124,8 +124,8 @@ class AfmoeMoE(nn.Module):
prefix
=
f
"
{
prefix
}
.shared_experts"
,
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
)
# Routed experts using
Shared
FusedMoE
# Routed experts using FusedMoE
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
num_experts
,
num_experts
=
config
.
num_experts
,
top_k
=
config
.
num_experts_per_tok
,
top_k
=
config
.
num_experts_per_tok
,
...
@@ -479,7 +479,7 @@ class AfmoeModel(nn.Module, EagleModelMixin):
...
@@ -479,7 +479,7 @@ class AfmoeModel(nn.Module, EagleModelMixin):
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -637,7 +637,7 @@ class AfmoeForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
...
@@ -637,7 +637,7 @@ class AfmoeForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
self
.
num_moe_layers
=
config
.
num_hidden_layers
-
config
.
num_dense_layers
self
.
num_moe_layers
=
config
.
num_hidden_layers
-
config
.
num_dense_layers
self
.
num_expert_groups
=
config
.
n_group
self
.
num_expert_groups
=
config
.
n_group
self
.
moe_layers
:
list
[
Shared
FusedMoE
]
=
[]
self
.
moe_layers
:
list
[
FusedMoE
]
=
[]
example_moe
=
None
example_moe
=
None
for
layer
in
self
.
model
.
layers
:
for
layer
in
self
.
model
.
layers
:
if
isinstance
(
layer
,
PPMissingLayer
):
if
isinstance
(
layer
,
PPMissingLayer
):
...
...
vllm/model_executor/models/aria.py
View file @
5e584ce9
...
@@ -14,7 +14,7 @@ from vllm.config.multimodal import BaseDummyOptions
...
@@ -14,7 +14,7 @@ from vllm.config.multimodal import BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_rank
from
vllm.distributed
import
get_tensor_model_parallel_rank
from
vllm.inputs
import
MultiModalDataDict
from
vllm.inputs
import
MultiModalDataDict
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
,
RowParallelLinear
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
,
RowParallelLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
@@ -214,7 +214,7 @@ class AriaProjector(nn.Module):
...
@@ -214,7 +214,7 @@ class AriaProjector(nn.Module):
return
out
return
out
class
AriaFusedMoE
(
Shared
FusedMoE
):
class
AriaFusedMoE
(
FusedMoE
):
def
weight_loader
(
def
weight_loader
(
self
,
param
:
nn
.
Parameter
,
loaded_weight
:
torch
.
Tensor
,
shard_id
:
str
self
,
param
:
nn
.
Parameter
,
loaded_weight
:
torch
.
Tensor
,
shard_id
:
str
)
->
None
:
)
->
None
:
...
...
vllm/model_executor/models/bailing_moe.py
View file @
5e584ce9
...
@@ -41,7 +41,7 @@ from vllm.distributed import (
...
@@ -41,7 +41,7 @@ from vllm.distributed import (
)
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -285,7 +285,7 @@ class BailingMoE(nn.Module):
...
@@ -285,7 +285,7 @@ class BailingMoE(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
self
.
num_experts
,
num_experts
=
self
.
num_experts
,
top_k
=
self
.
top_k
,
top_k
=
self
.
top_k
,
...
@@ -461,7 +461,7 @@ class BailingMoeModel(nn.Module):
...
@@ -461,7 +461,7 @@ class BailingMoeModel(nn.Module):
return
hidden_states
return
hidden_states
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/bailing_moe_linear.py
View file @
5e584ce9
...
@@ -21,7 +21,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
...
@@ -21,7 +21,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
RMSNormGated
,
RMSNormGated
,
layernorm_fn
,
layernorm_fn
,
)
)
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
,
SharedFusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -351,8 +351,8 @@ class BailingMoeV25(nn.Module):
...
@@ -351,8 +351,8 @@ class BailingMoeV25(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
# Routed experts using
Shared
FusedMoE
# Routed experts using FusedMoE
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
self
.
num_experts
,
num_experts
=
self
.
num_experts
,
top_k
=
self
.
top_k
,
top_k
=
self
.
top_k
,
...
...
vllm/model_executor/models/deepseek_mtp.py
View file @
5e584ce9
...
@@ -11,7 +11,7 @@ from vllm._aiter_ops import rocm_aiter_ops
...
@@ -11,7 +11,7 @@ from vllm._aiter_ops import rocm_aiter_ops
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
@@ -252,7 +252,7 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts):
...
@@ -252,7 +252,7 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts):
]
]
stacked_params_mapping
.
extend
(
indexer_fused_mapping
)
stacked_params_mapping
.
extend
(
indexer_fused_mapping
)
expert_params_mapping
=
Shared
FusedMoE
.
make_expert_params_mapping
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
5e584ce9
...
@@ -48,9 +48,9 @@ from vllm.model_executor.layers.activation import SiluAndMul
...
@@ -48,9 +48,9 @@ from vllm.model_executor.layers.activation import SiluAndMul
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
from
vllm.model_executor.layers.fused_moe
import
(
from
vllm.model_executor.layers.fused_moe
import
(
FusedMoE
,
GateLinear
,
GateLinear
,
RoutingMethodType
,
RoutingMethodType
,
SharedFusedMoE
,
)
)
from
vllm.model_executor.layers.layernorm
import
LayerNorm
,
RMSNorm
from
vllm.model_executor.layers.layernorm
import
LayerNorm
,
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
...
@@ -311,7 +311,7 @@ class DeepseekV2MoE(nn.Module):
...
@@ -311,7 +311,7 @@ class DeepseekV2MoE(nn.Module):
prefix
=
f
"
{
prefix
}
.shared_experts"
,
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
)
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
gate
=
self
.
gate
,
gate
=
self
.
gate
,
num_experts
=
config
.
n_routed_experts
,
num_experts
=
config
.
n_routed_experts
,
...
@@ -1432,7 +1432,7 @@ class DeepseekV2ForCausalLM(
...
@@ -1432,7 +1432,7 @@ class DeepseekV2ForCausalLM(
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -1474,7 +1474,7 @@ class DeepseekV2ForCausalLM(
...
@@ -1474,7 +1474,7 @@ class DeepseekV2ForCausalLM(
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping
=
Shared
FusedMoE
.
make_expert_params_mapping
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/dots1.py
View file @
5e584ce9
...
@@ -40,7 +40,7 @@ from vllm.distributed import (
...
@@ -40,7 +40,7 @@ from vllm.distributed import (
)
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -155,7 +155,7 @@ class Dots1MoE(nn.Module):
...
@@ -155,7 +155,7 @@ class Dots1MoE(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
n_routed_experts
,
num_experts
=
config
.
n_routed_experts
,
top_k
=
config
.
num_experts_per_tok
,
top_k
=
config
.
num_experts_per_tok
,
...
@@ -413,7 +413,7 @@ class Dots1Model(nn.Module):
...
@@ -413,7 +413,7 @@ class Dots1Model(nn.Module):
return
hidden_states
return
hidden_states
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/ernie45_moe.py
View file @
5e584ce9
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -188,7 +188,7 @@ class Ernie4_5_MoeMoE(nn.Module):
...
@@ -188,7 +188,7 @@ class Ernie4_5_MoeMoE(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
moe_num_experts
,
num_experts
=
config
.
moe_num_experts
,
top_k
=
config
.
moe_k
,
top_k
=
config
.
moe_k
,
...
@@ -485,7 +485,7 @@ class Ernie4_5_MoeModel(nn.Module):
...
@@ -485,7 +485,7 @@ class Ernie4_5_MoeModel(nn.Module):
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -667,7 +667,7 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
...
@@ -667,7 +667,7 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
self
.
num_moe_layers
=
len
(
moe_layers_indices
)
self
.
num_moe_layers
=
len
(
moe_layers_indices
)
self
.
num_expert_groups
=
1
self
.
num_expert_groups
=
1
self
.
moe_layers
:
list
[
Shared
FusedMoE
]
=
[]
self
.
moe_layers
:
list
[
FusedMoE
]
=
[]
example_moe
=
None
example_moe
=
None
for
layer
in
self
.
model
.
layers
:
for
layer
in
self
.
model
.
layers
:
if
isinstance
(
layer
,
PPMissingLayer
):
if
isinstance
(
layer
,
PPMissingLayer
):
...
...
vllm/model_executor/models/ernie45_vl_moe.py
View file @
5e584ce9
...
@@ -36,7 +36,7 @@ from vllm.config import CacheConfig, VllmConfig
...
@@ -36,7 +36,7 @@ from vllm.config import CacheConfig, VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -257,7 +257,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
...
@@ -257,7 +257,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
prefix
=
f
"
{
prefix
}
.text_experts_gate"
,
prefix
=
f
"
{
prefix
}
.text_experts_gate"
,
)
)
self
.
text_experts
=
Shared
FusedMoE
(
self
.
text_experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
moe_num_experts
[
0
],
num_experts
=
config
.
moe_num_experts
[
0
],
top_k
=
config
.
moe_k
,
top_k
=
config
.
moe_k
,
...
@@ -294,7 +294,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
...
@@ -294,7 +294,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
prefix
=
f
"
{
prefix
}
.vision_experts_gate"
,
prefix
=
f
"
{
prefix
}
.vision_experts_gate"
,
)
)
self
.
vision_experts
=
Shared
FusedMoE
(
self
.
vision_experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
moe_num_experts
[
1
],
num_experts
=
config
.
moe_num_experts
[
1
],
top_k
=
config
.
moe_k
,
top_k
=
config
.
moe_k
,
...
@@ -649,7 +649,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
...
@@ -649,7 +649,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping
=
Shared
FusedMoE
.
make_expert_params_mapping
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/exaone_moe.py
View file @
5e584ce9
...
@@ -31,7 +31,6 @@ from vllm.distributed import (
...
@@ -31,7 +31,6 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
)
)
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe.shared_fused_moe
import
SharedFusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
...
@@ -130,7 +129,7 @@ class ExaoneMoe(nn.Module):
...
@@ -130,7 +129,7 @@ class ExaoneMoe(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
gate
=
self
.
gate
,
gate
=
self
.
gate
,
num_experts
=
self
.
n_routed_experts
,
num_experts
=
self
.
n_routed_experts
,
...
...
vllm/model_executor/models/glm4_moe.py
View file @
5e584ce9
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
...
@@ -42,7 +42,7 @@ from vllm.distributed import (
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -178,7 +178,7 @@ class Glm4MoE(nn.Module):
...
@@ -178,7 +178,7 @@ class Glm4MoE(nn.Module):
else
:
else
:
self
.
shared_experts
=
None
self
.
shared_experts
=
None
self
.
experts
=
Shared
FusedMoE
(
self
.
experts
=
FusedMoE
(
shared_experts
=
self
.
shared_experts
,
shared_experts
=
self
.
shared_experts
,
num_experts
=
config
.
n_routed_experts
,
num_experts
=
config
.
n_routed_experts
,
top_k
=
config
.
num_experts_per_tok
,
top_k
=
config
.
num_experts_per_tok
,
...
@@ -466,7 +466,7 @@ class Glm4MoeModel(nn.Module):
...
@@ -466,7 +466,7 @@ class Glm4MoeModel(nn.Module):
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
vllm/model_executor/models/glm4_moe_lite.py
View file @
5e584ce9
...
@@ -41,7 +41,7 @@ from vllm.distributed import (
...
@@ -41,7 +41,7 @@ from vllm.distributed import (
get_pp_group
,
get_pp_group
,
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
Shared
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
...
@@ -308,7 +308,7 @@ class Glm4MoeLiteModel(nn.Module):
...
@@ -308,7 +308,7 @@ class Glm4MoeLiteModel(nn.Module):
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -334,7 +334,7 @@ class Glm4MoeLiteModel(nn.Module):
...
@@ -334,7 +334,7 @@ class Glm4MoeLiteModel(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping
=
Shared
FusedMoE
.
make_expert_params_mapping
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
@@ -616,7 +616,7 @@ class Glm4MoeLiteForCausalLM(
...
@@ -616,7 +616,7 @@ class Glm4MoeLiteForCausalLM(
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
# (param_name, weight_name, expert_id, shard_id)
return
Shared
FusedMoE
.
make_expert_params_mapping
(
return
FusedMoE
.
make_expert_params_mapping
(
self
,
self
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment