Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c721b814
Commit
c721b814
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1
parent
d53fe7e5
Changes
328
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
50 additions
and
1951 deletions
+50
-1951
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW3000.json
...yers/fused_moe/configs/E=8,N=2048,device_name=BW3000.json
+0
-146
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
...ed_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
+0
-164
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI.json
...ers/fused_moe/configs/E=8,N=2048,device_name=K100_AI.json
+0
-146
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
.../fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
+0
-164
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
...ayers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
+0
-146
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
...rs/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
+0
-164
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
...yers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
+0
-146
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
...ers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
+0
-146
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
...ers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
+0
-146
vllm/model_executor/layers/fused_moe/cutlass_moe.py
vllm/model_executor/layers/fused_moe/cutlass_moe.py
+6
-13
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+1
-2
vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
...l_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+8
-24
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
...l_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+0
-12
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+17
-8
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+0
-373
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+0
-6
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+1
-1
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+9
-22
vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
.../model_executor/layers/fused_moe/fused_moe_method_base.py
+1
-118
vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
...del_executor/layers/fused_moe/fused_moe_modular_method.py
+7
-4
No files found.
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW3000.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
3
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
deleted
100644 → 0
View file @
d53fe7e5
{
"1"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"96"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
3
}
}
vllm/model_executor/layers/fused_moe/cutlass_moe.py
View file @
c721b814
...
@@ -104,13 +104,7 @@ def run_cutlass_moe_fp8(
...
@@ -104,13 +104,7 @@ def run_cutlass_moe_fp8(
),
"Intermediate scale shape mismatch"
),
"Intermediate scale shape mismatch"
assert
out_dtype
in
[
torch
.
half
,
torch
.
bfloat16
],
"Invalid output dtype"
assert
out_dtype
in
[
torch
.
half
,
torch
.
bfloat16
],
"Invalid output dtype"
# NOTE(rob): the expert_map is used for the STANDARD case and
if
expert_map
is
not
None
:
# the batched format is used by the BATCHED case.
# TODO(rob): update the MK interface to only pass the expert_map
# during the STANDARD case to make this clearer across all kernels.
if
use_batched_format
:
assert
expert_num_tokens
is
not
None
else
:
assert
expert_num_tokens
is
None
assert
expert_num_tokens
is
None
# We have two modes: batched experts and non-batched experts.
# We have two modes: batched experts and non-batched experts.
...
@@ -386,10 +380,7 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
...
@@ -386,10 +380,7 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
# needed for STANDARD activation format kernels in DP/EP mode.
# needed for STANDARD activation format kernels in DP/EP mode.
# Note that the BATCHED activation format does not use
# Note that the BATCHED activation format does not use
# the expert map for identifying experts.
# the expert map for identifying experts.
return
not
(
return
not
moe_parallel_config
.
use_all2all_kernels
moe_parallel_config
.
use_fi_all2allv_kernels
or
moe_parallel_config
.
use_deepep_ht_kernels
)
def
supports_chunking
(
self
)
->
bool
:
def
supports_chunking
(
self
)
->
bool
:
return
True
return
True
...
@@ -651,8 +642,10 @@ def run_cutlass_moe_fp4(
...
@@ -651,8 +642,10 @@ def run_cutlass_moe_fp4(
class
CutlassExpertsFp4
(
mk
.
FusedMoEPermuteExpertsUnpermute
):
class
CutlassExpertsFp4
(
mk
.
FusedMoEPermuteExpertsUnpermute
):
@
property
@
staticmethod
def
expects_unquantized_inputs
(
self
)
->
bool
:
def
expects_unquantized_inputs
(
moe_config
:
FusedMoEConfig
,
quant_config
:
FusedMoEQuantConfig
)
->
bool
:
return
True
return
True
@
staticmethod
@
staticmethod
...
...
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
View file @
c721b814
...
@@ -148,8 +148,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -148,8 +148,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
@
staticmethod
@
staticmethod
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
# NOTE(rob): discovered an IMA with this combination. Needs investigation.
return
True
return
not
moe_parallel_config
.
use_fi_all2allv_kernels
def
supports_chunking
(
self
)
->
bool
:
def
supports_chunking
(
self
)
->
bool
:
return
True
return
True
...
...
vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
View file @
c721b814
...
@@ -103,7 +103,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -103,7 +103,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
num_experts
:
int
,
num_experts
:
int
,
a1_scale
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
,
)
->
Callable
:
)
->
Callable
:
has_scales
=
token_scales
is
not
None
has_scales
=
token_scales
is
not
None
...
@@ -175,7 +174,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -175,7 +174,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_topk_weights
,
expert_topk_weights
,
a1_scale
,
a1_scale
,
quant_config
,
quant_config
,
defer_input_quant
=
defer_input_quant
,
)
)
def
_receiver
(
def
_receiver
(
...
@@ -189,7 +187,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -189,7 +187,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_topk_weights
:
torch
.
Tensor
|
None
,
expert_topk_weights
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
event
.
event
is
not
None
:
if
event
.
event
is
not
None
:
event
.
current_stream_wait
()
event
.
current_stream_wait
()
...
@@ -224,15 +221,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -224,15 +221,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_num_tokens_per_expert_list
,
device
=
expert_x
.
device
expert_num_tokens_per_expert_list
,
device
=
expert_x
.
device
)
)
#
* For non-block quant, d
ispatch
in b16
and
q
uant
ize now as
#
D
ispatch and
Q
uant
#
DeepEP kernels only support dispatching block
scales.
# DeepEP kernels only support dispatching block
-quantized
#
* For expert kernels that require unquantized inputs,
#
activation scales.
#
defer quantization to FusedMoEExpertsPermuteUnpermute.
#
Dispatch in bfloat16 and quantize afterwards
if
not
quant_config
.
is_block_quantized
and
not
defer_input_quant
:
if
not
quant_config
.
is_block_quantized
:
# Quantize after dispatch.
# Quantize after dispatch.
expert_x_scale
=
None
expert_x_scale
=
None
if
expert_x
.
numel
()
!=
0
:
if
expert_x
.
numel
()
!=
0
:
# TODO: support per_act_token_quant,
expert_x
,
expert_x_scale
=
moe_kernel_quantize_input
(
expert_x
,
expert_x_scale
=
moe_kernel_quantize_input
(
expert_x
,
expert_x
,
a1_scale
,
a1_scale
,
...
@@ -261,7 +257,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -261,7 +257,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
ReceiverType
:
)
->
mk
.
ReceiverType
:
if
apply_router_weight_on_input
:
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
topk
=
topk_ids
.
size
(
1
)
...
@@ -271,12 +266,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -271,12 +266,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
)
)
a1
=
a1
*
topk_weights
.
to
(
a1
.
dtype
)
a1
=
a1
*
topk_weights
.
to
(
a1
.
dtype
)
# * DeepEP only supports fp8 block scales so quantize
if
quant_config
.
is_block_quantized
:
# before the dispatch for these models.
# Quant and Dispatch
# * For all other quantization, dispatch after.
# * For expert kernels that require unquantized inputs,
# defer quantization to FusedMoEExpertsPermuteUnpermute.
if
quant_config
.
is_block_quantized
and
not
defer_input_quant
:
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
a1
,
quant_config
.
a1_scale
,
quant_config
.
a1_scale
,
...
@@ -290,11 +281,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -290,11 +281,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
else
:
else
:
a1q
=
a1
a1q
=
a1
a1q_scale
=
None
a1q_scale
=
None
a1_post_scale
=
(
a1_post_scale
=
quant_config
.
a1_scale
quant_config
.
a1_gscale
if
quant_config
.
quant_dtype
==
"nvfp4"
else
quant_config
.
a1_scale
)
return
self
.
_do_dispatch
(
return
self
.
_do_dispatch
(
tokens
=
a1q
,
tokens
=
a1q
,
...
@@ -304,7 +291,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -304,7 +291,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
num_experts
=
num_experts
,
num_experts
=
num_experts
,
a1_scale
=
a1_post_scale
,
a1_scale
=
a1_post_scale
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
defer_input_quant
=
defer_input_quant
,
)
)
def
prepare
(
def
prepare
(
...
@@ -316,7 +302,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -316,7 +302,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
receiver
=
self
.
prepare_async
(
receiver
=
self
.
prepare_async
(
a1
,
a1
,
...
@@ -326,7 +311,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -326,7 +311,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
,
expert_map
,
apply_router_weight_on_input
,
apply_router_weight_on_input
,
quant_config
,
quant_config
,
defer_input_quant
,
)
)
return
receiver
()
return
receiver
()
...
...
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
View file @
c721b814
...
@@ -242,13 +242,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -242,13 +242,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
tuple
[
Callable
,
mk
.
ReceiverType
]:
)
->
tuple
[
Callable
,
mk
.
ReceiverType
]:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hidden_size
=
a1
.
size
(
1
)
hidden_size
=
a1
.
size
(
1
)
assert
hidden_size
in
self
.
SUPPORTED_HIDDEN_SIZES
,
(
assert
hidden_size
in
self
.
SUPPORTED_HIDDEN_SIZES
,
(
...
@@ -351,13 +345,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -351,13 +345,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hook
,
receiver
=
self
.
prepare_async
(
hook
,
receiver
=
self
.
prepare_async
(
a1
,
a1
,
topk_weights
,
topk_weights
,
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
c721b814
...
@@ -78,9 +78,16 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -78,9 +78,16 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
# - skip input activation quantization (kernel applies scaling)
# - skip input activation quantization (kernel applies scaling)
self
.
use_deepseek_fp8_block_scale
=
quant_config
.
is_block_quantized
self
.
use_deepseek_fp8_block_scale
=
quant_config
.
is_block_quantized
@
property
@
staticmethod
def
expects_unquantized_inputs
(
self
)
->
bool
:
def
expects_unquantized_inputs
(
return
self
.
quant_config
.
use_fp8_w8a8
and
self
.
quant_config
.
is_block_quantized
moe_config
:
mk
.
FusedMoEConfig
,
quant_config
:
FusedMoEQuantConfig
)
->
bool
:
# NVFP4 TP kernels and FP8 block-quantized kernels apply
# input quantization inside FusedMoEPermuteExpertsUnpermute.
return
(
quant_config
.
use_nvfp4_w4a4
and
not
moe_config
.
moe_parallel_config
.
use_all2all_kernels
)
or
(
quant_config
.
use_fp8_w8a8
and
quant_config
.
is_block_quantized
)
@
staticmethod
@
staticmethod
def
_supports_current_device
()
->
bool
:
def
_supports_current_device
()
->
bool
:
...
@@ -138,8 +145,10 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -138,8 +145,10 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
# FLASHINFER_CUTLASS currently uses its down P/F, which does not
# FLASHINFER_CUTLASS currently uses its down P/F, which does not
# work with SP. This will be removed in follow up after we get
# work with SP. This will be removed in follow up after we get
# rid of the FlashInfer specific P/F function.
# rid of the FlashInfer specific P/F function.
# TODO: the per-tensor fp8 kernels don't work with MNNVL FI A2As.
return
(
return
not
moe_parallel_config
.
is_sequence_parallel
moe_parallel_config
.
dp_size
==
1
or
moe_parallel_config
.
dp_size
==
moe_parallel_config
.
ep_size
)
@
staticmethod
@
staticmethod
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
...
@@ -186,9 +195,9 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -186,9 +195,9 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""
"""
workspace1
=
(
M
,
K
)
workspace1
=
(
M
,
K
)
workspace2
=
(
0
,)
workspace2
=
(
0
,)
# For NVFP4, the output is stored in a packed int8 format,
#
so the actual hidden dim is 2x the size of K here
.
#
For TP, the quantization is fused with fused_moe call
.
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
else
K
)
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
and
self
.
use_dp
else
K
)
# The workspace is determined by `aq`, since it comes after any
# The workspace is determined by `aq`, since it comes after any
# potential communication op and is involved in the expert computation.
# potential communication op and is involved in the expert computation.
return
(
workspace1
,
workspace2
,
output_shape
)
return
(
workspace1
,
workspace2
,
output_shape
)
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
deleted
100644 → 0
View file @
d53fe7e5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.distributed
import
get_dp_group
,
get_ep_group
from
vllm.distributed.device_communicators.base_device_communicator
import
(
All2AllManagerBase
,
)
from
vllm.forward_context
import
get_forward_context
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.prepare_finalize
import
(
MoEPrepareAndFinalizeNoEP
,
)
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
TopKWeightAndReduceNoOP
,
)
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.utils.flashinfer
import
nvfp4_block_scale_interleave
def get_local_sizes():
    """Return the chunk sizes across DP ranks from the current forward context."""
    dp_metadata = get_forward_context().dp_metadata
    return dp_metadata.get_chunk_sizes_across_dp_rank()
class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    """Base class for FlashInfer MoE prepare and finalize operations.

    Holds the configuration shared by the concrete subclasses and answers
    the simple capability queries of the prepare/finalize interface.
    """

    def __init__(
        self,
        use_dp: bool,
        num_dispatchers: int = 1,
        use_deepseek_fp8_block_scale: bool = False,
    ):
        super().__init__()
        self.use_dp = use_dp
        self.num_dispatchers_ = num_dispatchers
        # When set, subclasses skip activation quantization in prepare()
        # (DeepSeek-style FP8 block-scale path: the weight block scales are
        # consumed by the kernel instead).
        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
        self.local_tokens = None

    @property
    def activation_format(self) -> mk.FusedMoEActivationFormat:
        """Activation layout used by this implementation (always Standard)."""
        return mk.FusedMoEActivationFormat.Standard

    def max_num_tokens_per_rank(self) -> int | None:
        """Return None: no per-rank token limit is reported."""
        return None

    def topk_indices_dtype(self) -> torch.dtype | None:
        """Return None: no specific dtype is requested for top-k indices."""
        return None

    def num_dispatchers(self) -> int:
        """Return the dispatcher count given at construction time."""
        return self.num_dispatchers_

    def output_is_reduced(self) -> bool:
        """Return False unconditionally."""
        return False

    def _apply_router_weight_on_input(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """Scale ``a1`` in place by the router weights when requested.

        Only implemented for top-k == 1; any other top-k trips the assert.
        """
        if not apply_router_weight_on_input:
            return

        topk = topk_ids.size(1)
        assert topk == 1, (
            "apply_router_weight_on_input is only implemented for topk=1"
        )
        # In-place multiply: the caller's tensor is modified.
        a1.mul_(topk_weights.to(a1.dtype))
class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
    """FlashInfer implementation using AllToAll communication."""

    def __init__(
        self,
        use_dp: bool,
        num_dispatchers: int = 1,
        use_deepseek_fp8_block_scale: bool = False,
    ):
        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
        # Set by prepare() on the DP path and read back by finalize().
        self.alltoall_info = None
        # The all2all manager is only looked up for the DP case.
        self.all2all_manager = (
            get_ep_group().device_communicator.all2all_manager
            if self.use_dp
            else None
        )

    def prepare(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
    ) -> mk.PrepareResultType:
        """Quantize and, with DP enabled, dispatch the activations.

        Returns ``(a1q, a1q_scale, None, topk_ids, topk_weights)``. On the
        DP path ``topk_ids`` and ``topk_weights`` are the values returned by
        the all-to-all dispatch.
        """
        self._apply_router_weight_on_input(
            a1, topk_weights, topk_ids, apply_router_weight_on_input
        )

        if self.use_dp:
            # DP case: FlashInfer AllToAll handles quantization + dispatch.
            global_num_tokens_cpu = get_local_sizes()
            top_k = topk_ids.size(1)
            dispatched = flashinfer_alltoall_dispatch(
                self.all2all_manager,
                global_num_tokens_cpu,
                a1,
                quant_config.a1_gscale,
                topk_ids,
                topk_weights,
                top_k,
                num_experts,
                quant_config,
                use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
            )
            (self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = dispatched
        elif self.use_deepseek_fp8_block_scale:
            # Non-DP block-scale path: pass activations through unquantized.
            a1q, a1q_scale = a1, None
        else:
            # Non-DP case: quantize activations locally.
            a1q, a1q_scale = moe_kernel_quantize_input(
                a1,
                quant_config.a1_gscale,
                quant_config.quant_dtype,
                quant_config.per_act_token_quant,
                quant_config.block_shape,
                is_fp4_scale_swizzled=not self.use_dp,
            )

        return a1q, a1q_scale, None, topk_ids, topk_weights

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: mk.TopKWeightAndReduce,
    ) -> None:
        """Combine expert outputs (DP only) and copy the result into ``output``."""
        if self.use_dp:
            fused_expert_output = flashinfer_alltoall_combine(
                self.all2all_manager,
                fused_expert_output,
                top_k=topk_ids.size(1),
                token_count=output.shape[0],
                alltoall_info=self.alltoall_info,
            )
        output.copy_(fused_expert_output)
class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
    """FlashInfer prepare/finalize using all-gather / reduce-scatter over the DP group."""

    def __init__(
        self,
        use_dp: bool,
        num_dispatchers: int = 1,
        use_deepseek_fp8_block_scale: bool = False,
    ):
        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)

    def prepare(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
    ) -> mk.PrepareResultType:
        """Quantize the activations and, with DP enabled, all-gather them.

        Returns ``(a1q, a1q_scale, None, topk_ids, topk_weights)``.
        """
        self._apply_router_weight_on_input(
            a1, topk_weights, topk_ids, apply_router_weight_on_input
        )

        is_nvfp4 = quant_config.quant_dtype == "nvfp4"
        if is_nvfp4 and not self.use_dp:
            # Non-DP NVFP4: hand the activations back unquantized.
            return a1, None, None, topk_ids, topk_weights

        if self.use_deepseek_fp8_block_scale:
            # Block-scale path: pass activations through, omit per-token scales.
            a1q, a1q_scale = a1, None
        else:
            input_scale = quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale
            a1q, a1q_scale = moe_kernel_quantize_input(
                a1,
                input_scale,
                quant_config.quant_dtype,
                quant_config.per_act_token_quant,
                quant_config.block_shape,
                is_fp4_scale_swizzled=not self.use_dp,
            )

        if self.use_dp:
            # The scale tensor only joins the gather when it exists
            # (it is None on the block-scale path).
            has_scale = a1q_scale is not None
            gather_list = [topk_weights, topk_ids, a1q]
            if has_scale:
                gather_list.append(a1q_scale)

            gathered = get_dp_group().all_gatherv(
                gather_list,
                dim=0,
                sizes=get_local_sizes(),
            )

            if has_scale:
                topk_weights, topk_ids, a1q, a1q_scale = gathered
            else:
                topk_weights, topk_ids, a1q = gathered
                a1q_scale = None

        # Only reachable with DP enabled when is_nvfp4 (see early return above).
        if is_nvfp4 and a1q_scale is not None:
            if a1q_scale.element_size() == 1:
                # Reinterpret one-byte scale elements as uint8 before interleaving.
                a1q_scale = a1q_scale.view(torch.uint8)
            a1q_scale = nvfp4_block_scale_interleave(a1q_scale)

        return a1q, a1q_scale, None, topk_ids, topk_weights

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: mk.TopKWeightAndReduce,
    ) -> None:
        """Reduce-scatter expert outputs (DP only) and copy them into ``output``."""
        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)

        if self.use_dp:
            fused_expert_output = get_dp_group().reduce_scatterv(
                fused_expert_output, dim=0, sizes=get_local_sizes()
            )
        output.copy_(fused_expert_output)
def flashinfer_alltoall_dispatch(
    all2all_manager: All2AllManagerBase,
    global_num_tokens_cpu: list[int],
    x: torch.Tensor,
    gs: torch.Tensor,
    topk_ids: torch.Tensor,
    topk_weights: torch.Tensor,
    top_k: int,
    num_experts: int,
    quant_config: FusedMoEQuantConfig,
    use_deepseek_fp8_block_scale: bool = False,
):
    """Quantize (unless block-scale) and dispatch activations via FlashInfer all-to-all.

    Args:
        all2all_manager: Provides the rank, world size and workspace tensors.
        global_num_tokens_cpu: Per-rank token counts; its maximum is passed to
            the prepare call. If None, ``x.shape[0]`` is used instead.
        x: Activations to dispatch.
        gs: Scale passed to ``moe_kernel_quantize_input``.
        topk_ids: Selected expert ids.
        topk_weights: Router weights for the selected experts.
        top_k: Number of experts per token.
        num_experts: Expert count (passed twice to the prepare call).
        quant_config: Supplies quant dtype, per-token flag and block shape.
        use_deepseek_fp8_block_scale: When True, skip quantization and
            dispatch ``x`` as is with no scale tensor.

    Returns:
        ``(alltoall_info, topk_ids, topk_weights, x, x_sf)`` where ``x_sf`` is
        None on the block-scale path.

    Raises:
        AssertionError: If the all-to-all workspace is not available.
    """
    from flashinfer.comm.trtllm_alltoall import MnnvlMoe

    # Run the "ensure" call as its own statement: inside an `assert` it would
    # be removed under `python -O`, skipping any initialization it performs.
    workspace_ready = all2all_manager.ensure_alltoall_workspace_initialized()
    assert workspace_ready, "FlashInfer AllToAll workspace not available"

    ep_rank = all2all_manager.rank
    ep_size = all2all_manager.world_size
    max_num_token = (
        max(global_num_tokens_cpu)
        if global_num_tokens_cpu is not None
        else x.shape[0]
    )

    # The prepare call's topk_weights is viewed back as the caller's dtype below.
    orig_topk_weights_dtype = topk_weights.dtype
    alltoall_info, topk_ids, topk_weights, _ = (
        MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
            topk_ids,
            topk_weights,
            None,
            all2all_manager.prepare_workspace_tensor,
            max_num_token,
            ep_rank,
            ep_size,
            num_experts,
            num_experts,
            top_k,
        )
    )
    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)

    if not use_deepseek_fp8_block_scale:
        x, x_sf = moe_kernel_quantize_input(
            x,
            gs,
            quant_config.quant_dtype,
            quant_config.per_act_token_quant,
            quant_config.block_shape,
            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
        )
        x = MnnvlMoe.mnnvl_moe_alltoallv(
            x,
            alltoall_info,
            all2all_manager.workspace_tensor,
            ep_rank,
            ep_size,
        )
        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
            x_sf,
            alltoall_info,
            all2all_manager.workspace_tensor,
            ep_rank,
            ep_size,
        )
        if quant_config.quant_dtype == "nvfp4":
            x_sf = nvfp4_block_scale_interleave(x_sf)
    else:
        # Block-scale path: pass activations through without quantization.
        x_sf = None
        x = MnnvlMoe.mnnvl_moe_alltoallv(
            x,
            alltoall_info,
            all2all_manager.workspace_tensor,
            ep_rank,
            ep_size,
        )

    return alltoall_info, topk_ids, topk_weights, x, x_sf
def flashinfer_alltoall_combine(
    all2all_manager: All2AllManagerBase,
    output: torch.Tensor,
    top_k: int,
    token_count: int,
    alltoall_info,
):
    """Combine expert outputs across ranks via FlashInfer all-to-all.

    Args:
        all2all_manager: Provides the rank, world size and workspace tensor.
        output: Expert output to combine.
        top_k: Number of experts per token.
        token_count: Token count forwarded to the combine call.
        alltoall_info: Value returned by the matching dispatch step.

    Returns:
        The result of ``MnnvlMoe.mnnvl_moe_alltoallv_combine``.

    Raises:
        AssertionError: If the all-to-all workspace is not available.
    """
    from flashinfer.comm.trtllm_alltoall import MnnvlMoe

    # Run the "ensure" call as its own statement: inside an `assert` it would
    # be removed under `python -O`, skipping any initialization it performs.
    workspace_ready = all2all_manager.ensure_alltoall_workspace_initialized()
    assert workspace_ready, "FlashInfer AllToAll workspace not available"

    return MnnvlMoe.mnnvl_moe_alltoallv_combine(
        output,
        alltoall_info,
        all2all_manager.workspace_tensor,
        ep_rank=all2all_manager.rank,
        ep_size=all2all_manager.world_size,
        top_k=top_k,
        token_count=token_count,
    )
def create_flashinfer_prepare_finalize(
    use_dp: bool,
    use_nvfp4: bool = False,
    enable_alltoallv: bool = False,
    use_deepseek_fp8_block_scale: bool = False,
) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
    """Factory function to create the appropriate FlashInfer implementation.

    Without DP a ``MoEPrepareAndFinalizeNoEP`` is returned. With DP the
    all-to-all variant is used when ``enable_alltoallv`` is set (which
    requires ``use_nvfp4``), otherwise the all-gather variant.
    """
    if not use_dp:
        # CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
        # in a single call with the MoE experts kernel.
        return MoEPrepareAndFinalizeNoEP(
            defer_input_quant=use_deepseek_fp8_block_scale or use_nvfp4
        )

    if enable_alltoallv:
        assert use_nvfp4
        return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)

    return FlashInferAllGatherMoEPrepareAndFinalize(
        use_dp=True,
        use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
    )
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
View file @
c721b814
...
@@ -533,13 +533,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -533,13 +533,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
assert
a1
.
dim
()
==
2
assert
a1
.
dim
()
==
2
assert
topk_ids
.
dim
()
==
2
assert
topk_ids
.
dim
()
==
2
assert
topk_ids
.
size
(
0
)
==
a1
.
size
(
0
)
assert
topk_ids
.
size
(
0
)
==
a1
.
size
(
0
)
...
...
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
View file @
c721b814
...
@@ -593,7 +593,7 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -593,7 +593,7 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
@
staticmethod
@
staticmethod
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
return
not
moe_parallel_config
.
use_fi_all2allv_kernels
return
True
@
property
@
property
def
quant_type_id
(
self
)
->
int
:
def
quant_type_id
(
self
)
->
int
:
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
c721b814
...
@@ -399,27 +399,14 @@ def fused_moe_kernel(
...
@@ -399,27 +399,14 @@ def fused_moe_kernel(
# Map program ids `pid` to the block of C it should compute.
# Map program ids `pid` to the block of C it should compute.
# This is done in a grouped ordering to promote L2 data reuse.
# This is done in a grouped ordering to promote L2 data reuse.
pid
=
tl
.
program_id
(
axis
=
0
)
pid
=
tl
.
program_id
(
axis
=
0
)
# num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
num_pid_m
=
tl
.
cdiv
(
EM
,
BLOCK_SIZE_M
)
# num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_n
=
tl
.
cdiv
(
N
,
BLOCK_SIZE_N
)
# num_pid_in_group = GROUP_SIZE_M * num_pid_n
num_pid_in_group
=
GROUP_SIZE_M
*
num_pid_n
# group_id = pid // num_pid_in_group
group_id
=
pid
//
num_pid_in_group
# first_pid_m = group_id * GROUP_SIZE_M
first_pid_m
=
group_id
*
GROUP_SIZE_M
# group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
# pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_m
=
first_pid_m
+
((
pid
%
num_pid_in_group
)
%
group_size_m
)
# pid_n = (pid % num_pid_in_group) // group_size_m
pid_n
=
(
pid
%
num_pid_in_group
)
//
group_size_m
if
GROUP_SIZE_M
==
1
:
num_pid_n
=
tl
.
cdiv
(
N
,
BLOCK_SIZE_N
)
pid_m
=
pid
//
num_pid_n
pid_n
=
pid
%
num_pid_n
else
:
num_pid_m
=
tl
.
cdiv
(
EM
,
BLOCK_SIZE_M
)
num_pid_n
=
tl
.
cdiv
(
N
,
BLOCK_SIZE_N
)
num_pid_in_group
=
GROUP_SIZE_M
*
num_pid_n
group_id
=
pid
//
num_pid_in_group
first_pid_m
=
group_id
*
GROUP_SIZE_M
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
pid_m
=
first_pid_m
+
((
pid
%
num_pid_in_group
)
%
group_size_m
)
pid_n
=
(
pid
%
num_pid_in_group
)
//
group_size_m
# ----------------------------------------------------------
# ----------------------------------------------------------
...
@@ -1967,7 +1954,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -1967,7 +1954,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
@
staticmethod
@
staticmethod
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
return
not
moe_parallel_config
.
use_fi_all2allv_kernels
return
True
def
supports_chunking
(
self
)
->
bool
:
def
supports_chunking
(
self
)
->
bool
:
return
True
return
True
...
...
vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
View file @
c721b814
...
@@ -5,7 +5,6 @@ from abc import abstractmethod
...
@@ -5,7 +5,6 @@ from abc import abstractmethod
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEConfig
,
FusedMoEConfig
,
...
@@ -26,120 +25,4 @@ class FusedMoEMethodBase(QuantizeMethodBase):
...
@@ -26,120 +25,4 @@ class FusedMoEMethodBase(QuantizeMethodBase):
def
__init__
(
self
,
moe
:
FusedMoEConfig
):
def
__init__
(
self
,
moe
:
FusedMoEConfig
):
super
().
__init__
()
super
().
__init__
()
self
.
moe
:
FusedMoEConfig
=
moe
self
.
moe
:
FusedMoEConfig
=
moe
self
.
moe_quant_config
:
FusedMoEQuantConfig
|
None
=
None
self
.
moe_quant_config
:
FusedMoEQuantConfig
|
None
=
None
self
.
moe_mk
:
mk
.
FusedMoEModularKernel
|
None
=
None
\ No newline at end of file
@property
def supports_internal_mk(self) -> bool:
    """Whether a modular kernel has been attached (``moe_mk`` is set).

    NOTE(rob): temporary attribute to indicate support for
    completed migration to the new internal MK interface.
    """
    has_kernel = self.moe_mk is not None
    return has_kernel
@property
def mk_owns_shared_expert(self) -> bool:
    """Whether the attached modular kernel has ``shared_experts`` set.

    NOTE(rob): temporary attribute to indicate support for
    completed migration to the new internal MK interface.
    """
    if self.moe_mk is None:
        return False
    return self.moe_mk.shared_experts is not None
@abstractmethod
def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    """Abstract hook for creating the layer's weights; subclasses must override."""
    raise NotImplementedError
def uses_weight_scale_2_pattern(self) -> bool:
    """Report whether per-tensor weight scales use the 'weight_scale_2' name.

    The base implementation answers False (standard 'weight_scale'
    pattern). Quantization methods that use 'weight_scale_2' instead
    (e.g., FP4 variants) should override this to return True.
    """
    return False
def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None:
    """Delegate to ``all2all_utils.maybe_make_prepare_finalize`` for this method's config."""
    # Function-scope import, kept as in the original.
    from .all2all_utils import maybe_make_prepare_finalize as _make_prepare_finalize

    return _make_prepare_finalize(self.moe, self.moe_quant_config, routing_tables)
def select_gemm_impl(
    self,
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> FusedMoEPermuteExpertsUnpermute:
    """Pick the gemm implementation matching the all2all implementation.

    The base class has no implementation and always raises
    NotImplementedError; subclasses are expected to override.
    """
    message = (
        f"{self.__class__.__name__} must select appropriate gemm "
        "implementation based on the prepare_finalize"
    )
    raise NotImplementedError(message)
def prepare_dp_allgather_tensor(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
    """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.

    The base class always raises NotImplementedError.
    """
    message = (
        "Method 'prepare_dp_allgather_tensor' is not implemented in "
        f"{self.__class__.__name__}."
    )
    raise NotImplementedError(message)
@abstractmethod
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    """Abstract hook returning the quant config for ``layer``; subclasses must override."""
    raise NotImplementedError
@
property
def
topk_indices_dtype
(
self
)
->
torch
.
dtype
|
None
:
if
self
.
moe_mk
is
not
None
:
return
self
.
moe_mk
.
prepare_finalize
.
topk_indices_dtype
()
return
None
@property
def supports_eplb(self) -> bool:
    """Default capability flag: the base class returns False."""
    return False
@property
def allow_inplace(self) -> bool:
    """Default capability flag: the base class returns False."""
    return False
@property
def method_name(self) -> str:
    """Name of the concrete class of this method object."""
    cls = self.__class__
    return cls.__name__
@property
def is_monolithic(self) -> bool:
    """Default flag: the base class returns False."""
    return False
# @abstractmethod
def
apply
(
self
,
layer
:
"FusedMoE"
,
# type: ignore[name-defined] # noqa: F821
x
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
raise
NotImplementedError
# @abstractmethod
def
apply_monolithic
(
self
,
layer
:
"FusedMoE"
,
# type: ignore[name-defined] # noqa: F821
x
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
raise
NotImplementedError
vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
View file @
c721b814
...
@@ -30,11 +30,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
...
@@ -30,11 +30,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
):
):
super
().
__init__
(
old_quant_method
.
moe
)
super
().
__init__
(
old_quant_method
.
moe
)
self
.
moe_quant_config
=
old_quant_method
.
moe_quant_config
self
.
moe_quant_config
=
old_quant_method
.
moe_quant_config
self
.
moe_mk
=
experts
self
.
fused_experts
=
experts
self
.
disable_expert_map
=
getattr
(
self
.
disable_expert_map
=
getattr
(
old_quant_method
,
old_quant_method
,
"disable_expert_map"
,
"disable_expert_map"
,
not
self
.
moe_mk
.
supports_expert_map
(),
not
self
.
fused_experts
.
supports_expert_map
(),
)
)
self
.
old_quant_method
=
old_quant_method
self
.
old_quant_method
=
old_quant_method
assert
not
self
.
old_quant_method
.
is_monolithic
assert
not
self
.
old_quant_method
.
is_monolithic
...
@@ -57,6 +57,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
...
@@ -57,6 +57,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
),
),
)
)
@
property
def
topk_indices_dtype
(
self
)
->
torch
.
dtype
|
None
:
return
self
.
fused_experts
.
prepare_finalize
.
topk_indices_dtype
()
@
property
@
property
def
supports_eplb
(
self
)
->
bool
:
def
supports_eplb
(
self
)
->
bool
:
return
self
.
old_quant_method
.
supports_eplb
return
self
.
old_quant_method
.
supports_eplb
...
@@ -92,8 +96,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
...
@@ -92,8 +96,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
topk_weights
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
assert
self
.
moe_mk
is
not
None
return
self
.
fused_experts
(
return
self
.
moe_mk
(
hidden_states
=
x
,
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
w1
=
layer
.
w13_weight
,
w2
=
layer
.
w2_weight
,
w2
=
layer
.
w2_weight
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment