Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4a4fb3de
Commit
4a4fb3de
authored
Feb 06, 2026
by
zhuwenwen
Browse files
fix nn_moe run error
parent
530e785f
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
24 additions
and
11 deletions
+24
-11
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+8
-3
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+7
-7
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+7
-0
vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
...executor/layers/fused_moe/unquantized_fused_moe_method.py
+0
-1
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+2
-0
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
4a4fb3de
...
...
@@ -2179,10 +2179,11 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
):
# Check constraints.
if
self
.
quant_config
.
use_int4_w4a16
:
assert
hidden_states
.
size
(
-
1
)
//
2
==
w1
.
size
(
2
),
"Hidden size mismatch"
assert
hidden_states
.
size
(
-
1
)
//
2
==
w1
.
size
(
2
)
if
not
use_nn_moe
else
w1
.
size
(
1
)
,
"Hidden size mismatch"
else
:
assert
hidden_states
.
size
(
-
1
)
==
w1
.
size
(
2
),
(
f
"Hidden size mismatch
{
hidden_states
.
size
(
-
1
)
}
!=
{
w1
.
size
(
2
)
}
"
expect_hidden_size
=
w1
.
size
(
2
)
if
not
use_nn_moe
else
w1
.
size
(
1
)
assert
hidden_states
.
size
(
-
1
)
==
expect_hidden_size
,
(
f
"Hidden size mismatch
{
hidden_states
.
size
(
-
1
)
}
!=
{
expect_hidden_size
}
"
)
assert
hidden_states
.
is_contiguous
(),
"Hidden_states must be contiguous"
...
...
@@ -2201,6 +2202,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
hidden_states
,
w1
,
w2
,
topk_ids
)
if
use_nn_moe
:
N
=
w1
.
size
(
-
1
)
if
global_num_experts
==
-
1
:
global_num_experts
=
E
...
...
@@ -2211,6 +2215,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
self
.
quant_config
.
config_name
(
hidden_states
.
dtype
),
num_tokens
,
block_shape
=
self
.
block_shape
,
use_nn_moe
=
use_nn_moe
,
)
if
hidden_states
.
dtype
==
torch
.
bfloat16
:
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
4a4fb3de
...
...
@@ -1920,13 +1920,13 @@ class FusedMoE(CustomOp):
if
self
.
capture
is
not
None
:
self
.
capture
(
topk_ids
)
final_hidden_states
=
self
.
quant_method
.
apply
(
layer
=
self
,
x
=
x
,
# The type signature of this is wrong due to the hack.
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
use_nn_moe
=
self
.
use_nn_moe
,
# use_fused_gate=self.use_fused_gate,
)
if
has_separate_shared_experts
:
...
...
vllm/model_executor/layers/fused_moe/modular_kernel.py
View file @
4a4fb3de
...
...
@@ -1131,11 +1131,15 @@ class FusedMoEModularKernel(torch.nn.Module):
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
expert_tokens_meta
:
ExpertTokensMetadata
|
None
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
:
_
,
M_full
,
N
,
K
,
top_k
=
self
.
fused_experts
.
moe_problem_size
(
a1q
,
w1
,
w2
,
topk_ids
)
if
use_nn_moe
:
N
=
w1
.
size
(
2
)
num_chunks
,
CHUNK_SIZE
=
self
.
_chunk_info
(
M_full
)
def
input_chunk_range
(
chunk_idx
:
int
)
->
tuple
[
int
,
int
]:
...
...
@@ -1206,6 +1210,7 @@ class FusedMoEModularKernel(torch.nn.Module):
workspace2
=
workspace2
,
expert_tokens_meta
=
c_expert_tokens_meta
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
use_nn_moe
=
use_nn_moe
,
)
return
fused_out
...
...
@@ -1289,6 +1294,7 @@ class FusedMoEModularKernel(torch.nn.Module):
global_num_experts
:
int
=
-
1
,
expert_map
:
torch
.
Tensor
|
None
=
None
,
apply_router_weight_on_input
:
bool
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
This function computes a Mixture of Experts (MoE) layer using two sets
...
...
@@ -1350,6 +1356,7 @@ class FusedMoEModularKernel(torch.nn.Module):
expert_map
=
expert_map
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
expert_tokens_meta
=
expert_tokens_meta
,
use_nn_moe
=
use_nn_moe
,
)
return
self
.
_finalize
(
...
...
vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
View file @
4a4fb3de
...
...
@@ -316,7 +316,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
assert
self
.
kernel
is
not
None
return
self
.
kernel
(
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
4a4fb3de
...
...
@@ -1248,6 +1248,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
x
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
...
...
@@ -1263,6 +1264,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
global_num_experts
=
layer
.
global_num_experts
,
expert_map
=
layer
.
expert_map
,
quant_config
=
self
.
moe_quant_config
,
use_nn_moe
=
use_nn_moe
,
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment