Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3e197b3a
"...ssh:/git@developer.sourcefind.cn:2222/tsoc/openmm.git" did not exist on "aca24d5f09fa31d89a016b53b1557d3d563c03ed"
Commit
3e197b3a
authored
Apr 09, 2026
by
guanyu1
Browse files
0151-qwen3_5 aiter moe接入,但是还没有验证精度
parent
bcb2ba6c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
176 additions
and
23 deletions
+176
-23
vllm/envs.py
vllm/envs.py
+2
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+62
-2
vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
...executor/layers/fused_moe/unquantized_fused_moe_method.py
+112
-21
No files found.
vllm/envs.py
View file @
3e197b3a
...
@@ -906,6 +906,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -906,6 +906,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_FUSED_MOE_CHUNK_SIZE"
:
lambda
:
int
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
16
*
1024
))
os
.
getenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
16
*
1024
))
),
),
# Control whether to use fused MoE activation chunking. Current chunking
# Control whether to use fused MoE activation chunking. Current chunking
# logic is incompatible with torch.compile and causes IMA. See issue
# logic is incompatible with torch.compile and causes IMA. See issue
# https://github.com/vllm-project/vllm/issues/19631.
# https://github.com/vllm-project/vllm/issues/19631.
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
3e197b3a
...
@@ -10,7 +10,7 @@ import math
...
@@ -10,7 +10,7 @@ import math
from
collections.abc
import
Callable
from
collections.abc
import
Callable
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
from
vllm._aiter_ops
import
rocm_aiter_ops
import
torch
import
torch
import
vllm.envs
as
envs
import
vllm.envs
as
envs
...
@@ -1767,12 +1767,72 @@ def fused_experts_impl(
...
@@ -1767,12 +1767,72 @@ def fused_experts_impl(
return
False
return
False
return
True
return
True
w1_aiter_shuffled
=
getattr
(
w1
,
"aiter_moe_shuffled"
,
False
)
w2_aiter_shuffled
=
getattr
(
w2
,
"aiter_moe_shuffled"
,
False
)
if
w1_aiter_shuffled
!=
w2_aiter_shuffled
:
raise
RuntimeError
(
"MoE weights must either both be AITER-shuffled or both be "
"unshuffled."
)
is_aiter_shuffled
=
w1_aiter_shuffled
and
w2_aiter_shuffled
if
is_aiter_shuffled
:
if
not
(
current_platform
.
is_rocm
()
and
rocm_aiter_ops
.
is_fused_moe_enabled
()
):
raise
RuntimeError
(
"AITER-shuffled MoE weights require ROCm AITER fused MoE "
"to be enabled."
)
try
:
from
aiter.fused_moe_asm_wna16
import
fused_experts_asm_impl
except
Exception
as
e
:
raise
RuntimeError
(
"AITER-shuffled MoE weights were loaded, but the ASM "
"kernel is unavailable. Ensure the `aiter` package is "
"installed and exposes `fused_moe_asm_wna16`."
)
from
e
if
activation
!=
"silu"
:
raise
RuntimeError
(
"ASM Marlin W16A16 MoE only supports activation='silu'."
)
if
apply_router_weight_on_input
:
raise
RuntimeError
(
"ASM Marlin W16A16 MoE does not support apply_router_weight_on_input=True."
)
if
w1_bias
is
not
None
or
w2_bias
is
not
None
:
raise
RuntimeError
(
"ASM Marlin W16A16 MoE does not support expert biases."
)
return
fused_experts_asm_impl
(
hidden_states
=
hidden_states
,
w1
=
w1
,
w2
=
w2
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
dtype
=
hidden_states
.
dtype
,
inplace
=
inplace
,
activation
=
activation
,
per_channel_quant
=
per_channel_quant
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
w1_scale
=
w1_scale
,
w2_scale
=
w2_scale
,
w1_zp
=
w1_zp
,
w2_zp
=
w2_zp
,
a1_scale
=
a1_scale
,
a2_scale
=
a2_scale
,
block_shape
=
block_shape
,
use_shuffle
=
1
,
)
is_packed
=
(
is_packed
=
(
getattr
(
w1
,
"marlin_w16a16_packed"
,
False
)
getattr
(
w1
,
"marlin_w16a16_packed"
,
False
)
or
getattr
(
w2
,
"marlin_w16a16_packed"
,
False
)
or
getattr
(
w2
,
"marlin_w16a16_packed"
,
False
)
or
_is_marlin_w16a16_packed
(
w1
,
w2
)
or
_is_marlin_w16a16_packed
(
w1
,
w2
)
)
)
if
is_packed
:
if
is_packed
:
if
envs
.
VLLM_USE_MOE_W16A16_TRITON
:
if
envs
.
VLLM_USE_MOE_W16A16_TRITON
:
raise
RuntimeError
(
raise
RuntimeError
(
...
...
vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
View file @
3e197b3a
...
@@ -283,28 +283,119 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
...
@@ -283,28 +283,119 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
packed
[
i
].
copy_
(
tmp
)
packed
[
i
].
copy_
(
tmp
)
del
tmp
del
tmp
return
packed
return
packed
def
_asm_shuffle_weight_b8
(
x
:
torch
.
Tensor
,
stage
:
torch
.
int32
=
1
)
->
torch
.
Tensor
:
# Hardcode BLOCK_K and BLOCK_N
assert
x
.
dtype
in
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
,
torch
.
int8
,
torch
.
float8_e4m3fn
]
if
x
.
dtype
==
torch
.
int8
or
x
.
dtype
==
torch
.
float8_e4m3fn
:
N
=
16
K
=
16
IK
=
64
IN
=
64
BK
=
256
BN
=
128
if
stage
==
1
:
if
x
.
shape
[
-
2
]
%
128
!=
0
and
x
.
shape
[
-
2
]
%
64
==
0
:
BN
=
64
if
stage
==
2
:
if
x
.
shape
[
-
1
]
%
128
==
0
:
BK
=
128
elif
x
.
shape
[
-
1
]
%
128
==
96
:
BN
=
64
BK
=
64
assert
x
.
shape
[
-
2
]
%
BN
==
0
,
f
"
{
x
.
shape
[
-
2
]
}
%
{
BN
}
==
{
x
.
shape
[
-
2
]
%
BN
}
"
x_
=
x
multiple
=
x
.
shape
[
-
1
]
//
BK
*
BK
part1
=
x
[:,
:,
:
multiple
]
### part1 shuffle
# 0, 1, 2, 3, 4, 5, 6, 7, 8
part1
=
part1
.
view
(
-
1
,
part1
.
shape
[
-
2
]
//
BN
,
BN
//
IN
,
IN
//
N
,
N
,
part1
.
shape
[
-
1
]
//
BK
,
BK
//
IK
,
IK
//
K
,
K
)
part1
=
part1
.
permute
(
0
,
1
,
5
,
2
,
6
,
3
,
7
,
4
,
8
).
contiguous
()
part1
=
part1
.
flatten
(
start_dim
=
1
)
### part2 shuffle
part2
=
x
[:,
:,
multiple
:]
IK
=
32
BK
=
32
# 0, 1, 2, 3, 4, 5, 6, 7, 8
part2
=
part2
.
view
(
-
1
,
part2
.
shape
[
-
2
]
//
BN
,
BN
//
IN
,
IN
//
N
,
N
,
part2
.
shape
[
-
1
]
//
BK
,
BK
//
IK
,
IK
//
K
,
K
)
part2
=
part2
.
permute
(
0
,
1
,
5
,
2
,
6
,
3
,
7
,
4
,
8
).
contiguous
()
part2
=
part2
.
flatten
(
start_dim
=
1
)
### combine
x_
=
torch
.
cat
((
part1
,
part2
),
dim
=
1
)
x_
=
x_
.
view
(
*
x
.
shape
)
return
x_
elif
x
.
dtype
==
torch
.
float16
or
x
.
dtype
==
torch
.
bfloat16
:
N
=
16
K
=
8
IK
=
32
IN
=
64
BK
=
128
BN
=
64
if
stage
==
2
:
BK
=
32
else
:
assert
False
,
f
"not support
{
x
.
dtype
}
"
assert
x
.
shape
[
-
2
]
%
BN
==
0
,
f
"
{
x
.
shape
[
-
2
]
}
%
{
BN
}
==
{
x
.
shape
[
-
2
]
%
BN
}
"
assert
x
.
shape
[
-
1
]
%
BK
==
0
,
f
"
{
x
.
shape
[
-
1
]
}
%
{
BK
}
==
{
x
.
shape
[
-
1
]
%
BK
}
"
x_
=
x
# 0, 1, 2, 3, 4, 5, 6, 7, 8
x_
=
x_
.
view
(
-
1
,
x
.
shape
[
-
2
]
//
BN
,
BN
//
IN
,
IN
//
N
,
N
,
x
.
shape
[
-
1
]
//
BK
,
BK
//
IK
,
IK
//
K
,
K
)
x_
=
x_
.
permute
(
0
,
1
,
5
,
2
,
6
,
3
,
7
,
4
,
8
)
x_
=
x_
.
contiguous
()
x_
=
x_
.
view
(
*
x
.
shape
)
return
x_
with
torch
.
no_grad
():
with
torch
.
no_grad
():
w1_packed
=
_pack_per_expert
(
w1
)
if
current_platform
.
is_rocm
()
and
rocm_aiter_ops
.
is_fused_moe_enabled
():
w2_packed
=
_pack_per_expert
(
w2
)
replace_parameter
(
layer
,
new_w1
=
Parameter
(
w1_packed
,
requires_grad
=
False
)
"w13_weight"
,
new_w2
=
Parameter
(
w2_packed
,
requires_grad
=
False
)
_asm_shuffle_weight_b8
(
w1
,
stage
=
1
),
)
# Preserve any custom weight attributes (e.g. loaders).
replace_parameter
(
if
hasattr
(
w1
,
"__dict__"
):
layer
,
for
k
,
v
in
w1
.
__dict__
.
items
():
"w2_weight"
,
setattr
(
new_w1
,
k
,
v
)
_asm_shuffle_weight_b8
(
w2
,
stage
=
2
),
if
hasattr
(
w2
,
"__dict__"
):
)
for
k
,
v
in
w2
.
__dict__
.
items
():
setattr
(
new_w2
,
k
,
v
)
new_w1
=
layer
.
w13_weight
new_w2
=
layer
.
w2_weight
setattr
(
new_w1
,
"marlin_w16a16_packed"
,
True
)
setattr
(
new_w2
,
"marlin_w16a16_packed"
,
True
)
# Preserve any custom weight attributes (e.g. loaders).
if
hasattr
(
w1
,
"__dict__"
):
layer
.
w13_weight
=
new_w1
for
k
,
v
in
w1
.
__dict__
.
items
():
layer
.
w2_weight
=
new_w2
setattr
(
new_w1
,
k
,
v
)
layer
.
_marlin_w16a16_moe_packed
=
True
if
hasattr
(
w2
,
"__dict__"
):
for
k
,
v
in
w2
.
__dict__
.
items
():
setattr
(
new_w2
,
k
,
v
)
setattr
(
new_w1
,
"aiter_moe_shuffled"
,
True
)
setattr
(
new_w2
,
"aiter_moe_shuffled"
,
True
)
layer
.
_marlin_w16a16_moe_packed
=
True
else
:
w1_packed
=
_pack_per_expert
(
w1
)
w2_packed
=
_pack_per_expert
(
w2
)
new_w1
=
Parameter
(
w1_packed
,
requires_grad
=
False
)
new_w2
=
Parameter
(
w2_packed
,
requires_grad
=
False
)
# Preserve any custom weight attributes (e.g. loaders).
if
hasattr
(
w1
,
"__dict__"
):
for
k
,
v
in
w1
.
__dict__
.
items
():
setattr
(
new_w1
,
k
,
v
)
if
hasattr
(
w2
,
"__dict__"
):
for
k
,
v
in
w2
.
__dict__
.
items
():
setattr
(
new_w2
,
k
,
v
)
setattr
(
new_w1
,
"marlin_w16a16_packed"
,
True
)
setattr
(
new_w2
,
"marlin_w16a16_packed"
,
True
)
layer
.
w13_weight
=
new_w1
layer
.
w2_weight
=
new_w2
layer
.
_marlin_w16a16_moe_packed
=
True
return
return
except
Exception
:
except
Exception
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment