Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
83239ff1
Unverified
Commit
83239ff1
authored
Jan 15, 2026
by
Michael Goin
Committed by
GitHub
Jan 15, 2026
Browse files
Add thread_n=64 support to Marlin MoE (#32360)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
c277fbdf
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
5 deletions
+8
-5
csrc/moe/marlin_moe_wna16/generate_kernels.py
csrc/moe/marlin_moe_wna16/generate_kernels.py
+1
-1
csrc/moe/marlin_moe_wna16/ops.cu
csrc/moe/marlin_moe_wna16/ops.cu
+4
-2
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
...el_executor/layers/quantization/utils/marlin_utils_fp8.py
+3
-2
No files found.
csrc/moe/marlin_moe_wna16/generate_kernels.py
View file @
83239ff1
...
...
@@ -58,7 +58,7 @@ TEMPLATE = (
"( MARLIN_KERNEL_PARAMS );"
)
THREAD_CONFIGS
=
[(
128
,
128
,
256
),
(
64
,
256
,
256
),
(
64
,
128
,
128
)]
THREAD_CONFIGS
=
[(
128
,
128
,
256
),
(
64
,
256
,
256
),
(
64
,
128
,
128
),
(
128
,
64
,
128
)]
THREAD_M_BLOCKS
=
[
0.5
,
1
,
2
,
3
,
4
]
...
...
csrc/moe/marlin_moe_wna16/ops.cu
View file @
83239ff1
...
...
@@ -126,14 +126,16 @@ thread_config_t small_batch_thread_configs[] = {
// thread_k, thread_n, num_threads
{
128
,
128
,
256
},
{
64
,
128
,
128
}};
{
64
,
128
,
128
},
{
128
,
64
,
128
}};
thread_config_t
large_batch_thread_configs
[]
=
{
// Ordered by priority
// thread_k, thread_n, num_threads
{
64
,
256
,
256
},
{
64
,
128
,
128
}};
{
64
,
128
,
128
},
{
128
,
64
,
128
}};
typedef
struct
{
int
blocks_per_sm
;
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
View file @
83239ff1
...
...
@@ -226,6 +226,7 @@ def prepare_fp8_moe_layer_for_marlin(
e
=
layer
.
num_experts
k
=
layer
.
hidden_size
n
=
layer
.
intermediate_size_per_partition
w13_n
=
w13_weight
.
size
(
1
)
weight_block_size
=
getattr
(
layer
,
"weight_block_size"
,
None
)
# WORKSPACE
...
...
@@ -240,7 +241,7 @@ def prepare_fp8_moe_layer_for_marlin(
def
repack_weight
(
name
:
str
,
weight
:
torch
.
Tensor
)
->
torch
.
Tensor
:
tensor_list
=
[]
if
"w13"
in
name
:
size_n
,
size_k
=
n
*
2
,
k
size_n
,
size_k
=
w13_n
,
k
else
:
size_n
,
size_k
=
k
,
n
...
...
@@ -268,7 +269,7 @@ def prepare_fp8_moe_layer_for_marlin(
scales
=
scales
.
to
(
layer
.
orig_dtype
)
tensor_list
=
[]
if
"w13"
in
name
:
size_n
,
size_k
=
n
*
2
,
k
size_n
,
size_k
=
w13_n
,
k
else
:
size_n
,
size_k
=
k
,
n
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment