Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
94096a47
Unverified
Commit
94096a47
authored
Aug 16, 2025
by
Michael Goin
Committed by
GitHub
Aug 16, 2025
Browse files
[UX] Separate marlin moe config logic from triton moe (#23006)
parent
a258ad8b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
22 deletions
+7
-22
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+6
-14
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+1
-8
No files found.
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
View file @
94096a47
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Fused MoE utilities for GPTQ."""
import
functools
from
typing
import
Optional
import
torch
import
vllm._custom_ops
as
ops
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
moe_align_block_size
,
try_get_optimal_moe_config
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
moe_align_block_size
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
marlin_make_workspace_new
,
maybe_warn_marlin_atomic_add
)
from
vllm.scalar_type
import
ScalarType
,
scalar_types
...
...
@@ -98,17 +96,11 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
N
=
w2
.
shape
[
1
]
*
16
topk
=
topk_ids
.
shape
[
1
]
get_config_func
=
functools
.
partial
(
try_get_optimal_moe_config
,
w1
.
shape
,
w2
.
shape
,
topk_ids
.
shape
[
1
],
None
,
is_marlin
=
True
,
)
config
=
get_config_func
(
M
)
block_size_m
=
config
[
"BLOCK_SIZE_M"
]
# M block size selection logic
# TODO: tune this further for specific models
for
block_size_m
in
[
8
,
16
,
32
,
48
,
64
]:
if
M
*
topk
/
E
/
block_size_m
<
0.9
:
break
if
global_num_experts
==
-
1
:
global_num_experts
=
E
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
94096a47
...
...
@@ -801,7 +801,6 @@ def get_default_config(
K
:
int
,
topk
:
int
,
dtype
:
Optional
[
str
],
is_marlin
:
bool
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
)
->
dict
[
str
,
int
]:
if
dtype
==
"fp8_w8a8"
and
block_shape
is
not
None
:
...
...
@@ -832,11 +831,6 @@ def get_default_config(
config
=
{
"BLOCK_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
1
}
else
:
config
=
{
"BLOCK_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
1
}
elif
is_marlin
:
for
block_size_m
in
[
8
,
16
,
32
,
48
,
64
]:
if
M
*
topk
/
E
/
block_size_m
<
0.9
:
break
return
{
"BLOCK_SIZE_M"
:
block_size_m
}
elif
M
<=
E
:
config
=
{
"BLOCK_SIZE_M"
:
16
,
...
...
@@ -860,7 +854,6 @@ def try_get_optimal_moe_config(
top_k
:
int
,
dtype
:
Optional
[
str
],
M
:
int
,
is_marlin
:
bool
=
False
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
)
->
dict
[
str
,
int
]:
from
vllm.model_executor.layers.fused_moe
import
get_config
...
...
@@ -883,7 +876,7 @@ def try_get_optimal_moe_config(
else
:
# Else use the default config
config
=
get_default_config
(
M
,
E
,
N
,
w1_shape
[
2
],
top_k
,
dtype
,
is_marlin
,
block_shape
)
block_shape
)
return
config
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment