Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
531 additions
and
100 deletions
+531
-100
tests/kernels/moe/test_modular_oai_triton_moe.py
tests/kernels/moe/test_modular_oai_triton_moe.py
+2
-1
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+129
-2
tests/kernels/moe/test_moe_align_block_size.py
tests/kernels/moe/test_moe_align_block_size.py
+2
-1
tests/kernels/moe/test_pplx_moe.py
tests/kernels/moe/test_pplx_moe.py
+7
-7
tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
...s/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
+6
-1
tests/kernels/moe/test_triton_moe_no_act_mul.py
tests/kernels/moe/test_triton_moe_no_act_mul.py
+201
-0
tests/kernels/moe/untest_cutlass_moe.py
tests/kernels/moe/untest_cutlass_moe.py
+50
-29
tests/kernels/moe/untest_moe_permute_unpermute.py
tests/kernels/moe/untest_moe_permute_unpermute.py
+3
-2
tests/kernels/moe/untest_nvfp4_moe.py
tests/kernels/moe/untest_nvfp4_moe.py
+22
-11
tests/kernels/moe/untest_pplx_cutlass_moe.py
tests/kernels/moe/untest_pplx_cutlass_moe.py
+2
-1
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+2
-1
tests/kernels/quant_utils.py
tests/kernels/quant_utils.py
+11
-25
tests/kernels/quantization/test_awq_triton.py
tests/kernels/quantization/test_awq_triton.py
+3
-3
tests/kernels/quantization/test_cutlass_w4a8_moe.py
tests/kernels/quantization/test_cutlass_w4a8_moe.py
+3
-2
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
...s/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+2
-1
tests/kernels/quantization/test_flashinfer_scaled_mm.py
tests/kernels/quantization/test_flashinfer_scaled_mm.py
+2
-1
tests/kernels/quantization/test_fp8_min_max_helper.py
tests/kernels/quantization/test_fp8_min_max_helper.py
+65
-0
tests/kernels/quantization/test_fp8_quant_group.py
tests/kernels/quantization/test_fp8_quant_group.py
+14
-7
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+4
-4
tests/kernels/quantization/test_int8_kernel.py
tests/kernels/quantization/test_int8_kernel.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/kernels/moe/test_modular_oai_triton_moe.py
View file @
7e63ef82
...
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
...
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
)
)
from
vllm.model_executor.layers.utils
import
shuffle_weight
from
vllm.model_executor.layers.utils
import
shuffle_weight
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
MNK
=
[
MNK
=
[
(
1
,
512
,
384
),
(
1
,
512
,
384
),
...
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
...
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
unfused
:
bool
,
unfused
:
bool
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
(
(
w1
,
w1
,
w2
,
w2
,
...
...
tests/kernels/moe/test_moe.py
View file @
7e63ef82
...
@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
...
@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
NUM_EXPERTS
=
[
8
,
64
,
192
]
NUM_EXPERTS
=
[
8
,
64
,
192
]
NUM_EXPERTS_LARGE
=
[
128
,
256
]
EP_SIZE
=
[
1
,
4
]
EP_SIZE
=
[
1
,
4
]
TOP_KS
=
[
2
,
6
]
TOP_KS
=
[
2
,
6
]
TOP_KS_SMALL
=
[
1
,
2
]
MOE_MARLIN_QUANT_TEST_CONFIGS
=
[
MOE_MARLIN_QUANT_TEST_CONFIGS
=
[
# AWQ-INT4
# AWQ-INT4
...
@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [
...
@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [
(
40000
,
1024
,
1024
),
(
40000
,
1024
,
1024
),
]
]
FUSED_MOE_MNK_FACTORS_SMALL_M
=
[
(
1
,
128
,
128
),
(
1
,
2048
,
128
),
(
2
,
2048
,
128
),
(
2
,
2048
,
511
),
]
FUSED_MOE_WN16_MNK_FACTORS
=
[
FUSED_MOE_WN16_MNK_FACTORS
=
[
(
1
,
128
,
128
),
(
1
,
128
,
128
),
(
1
,
1024
,
1024
),
(
1
,
1024
,
1024
),
...
@@ -233,7 +244,7 @@ def test_fused_moe(
...
@@ -233,7 +244,7 @@ def test_fused_moe(
monkeypatch
,
monkeypatch
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
...
@@ -328,6 +339,111 @@ def test_fused_moe(
...
@@ -328,6 +339,111 @@ def test_fused_moe(
)
)
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
FUSED_MOE_MNK_FACTORS_SMALL_M
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS_LARGE
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS_SMALL
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"chunk_size"
,
[
8192
])
def
test_naive_block_assignment_moe
(
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
padding
:
bool
,
chunk_size
:
int
,
monkeypatch
,
workspace_init
,
):
current_platform
.
seed_everything
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
#
# Setup test data
#
#
# Setup test data
#
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w2
=
torch
.
randn
((
e
,
k
,
n
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
score
=
torch
.
randn
((
m
,
e
),
device
=
"cuda"
,
dtype
=
dtype
)
e_map
=
None
#
# Setup test functions
#
quant_config
=
FUSED_MOE_UNQUANTIZED_CONFIG
m_fused_moe_fn
=
modular_triton_fused_moe
(
quant_config
)
def
m_fused_moe
(
a
:
torch
.
Tensor
,
w1
:
torch
.
Tensor
,
w2
:
torch
.
Tensor
,
score
:
torch
.
Tensor
,
topk
:
int
,
global_num_experts
:
int
=
-
1
,
expert_map
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
topk_weights
,
topk_ids
,
_
=
fused_topk
(
a
,
score
,
topk
,
False
)
return
m_fused_moe_fn
(
a
,
w1
,
w2
,
topk_weights
,
topk_ids
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
)
fused_moe_fn
=
functools
.
partial
(
fused_moe
,
renormalize
=
False
)
#
# Run tests
#
runner
=
functools
.
partial
(
run_moe_test
,
a
=
a
,
w1
=
w1
,
w2
=
w2
,
score
=
score
,
topk
=
topk
,
global_num_experts
=
e
,
expert_map
=
e_map
,
padding
=
padding
,
)
# Note: for now use_compile will error out if the problem size is
# large enough to trigger chunking. I'm leaving the flag and
# setup code in case we are able to revisit this later.
use_compile
=
False
use_cudagraph
=
n
>=
1024
and
k
>=
1024
and
current_platform
.
is_cuda_alike
()
with
set_current_vllm_config
(
vllm_config
):
baseline_output
=
runner
(
torch_moe
,
iterative_moe
)
runner
(
baseline_output
,
fused_moe_fn
,
use_compile
=
use_compile
,
use_cudagraph
=
use_cudagraph
,
)
runner
(
baseline_output
,
m_fused_moe
,
use_compile
=
use_compile
,
use_cudagraph
=
use_cudagraph
,
)
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
FUSED_MOE_WN16_MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
FUSED_MOE_WN16_MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
...
@@ -466,7 +582,12 @@ def test_fused_moe_wn16(
...
@@ -466,7 +582,12 @@ def test_fused_moe_wn16(
)
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_mixtral_moe
(
def
test_mixtral_moe
(
dist_init
,
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
default_vllm_config
,
dist_init
,
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
,
):
):
"""Make sure our Mixtral MoE implementation agrees with the one from
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""
huggingface."""
...
@@ -487,6 +608,7 @@ def test_mixtral_moe(
...
@@ -487,6 +608,7 @@ def test_mixtral_moe(
monkeypatch
.
setenv
(
"MASTER_ADDR"
,
"localhost"
)
monkeypatch
.
setenv
(
"MASTER_ADDR"
,
"localhost"
)
monkeypatch
.
setenv
(
"MASTER_PORT"
,
"12345"
)
monkeypatch
.
setenv
(
"MASTER_PORT"
,
"12345"
)
init_distributed_environment
()
init_distributed_environment
()
init_workspace_manager
(
torch
.
cuda
.
current_device
())
# Instantiate our and huggingface's MoE blocks
# Instantiate our and huggingface's MoE blocks
vllm_config
.
compilation_config
.
static_forward_context
=
dict
()
vllm_config
.
compilation_config
.
static_forward_context
=
dict
()
...
@@ -540,6 +662,11 @@ def test_mixtral_moe(
...
@@ -540,6 +662,11 @@ def test_mixtral_moe(
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe
.
experts
.
quant_method
.
process_weights_after_loading
(
vllm_moe
.
experts
)
# Run forward passes for both MoE blocks
# Run forward passes for both MoE blocks
hf_states
,
_
=
hf_moe
.
forward
(
hf_inputs
)
hf_states
,
_
=
hf_moe
.
forward
(
hf_inputs
)
vllm_states
=
vllm_moe
.
forward
(
vllm_inputs
)
vllm_states
=
vllm_moe
.
forward
(
vllm_inputs
)
...
...
tests/kernels/moe/test_moe_align_block_size.py
View file @
7e63ef82
...
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
...
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.torch_utils
import
set_random_seed
NUM_TOKENS
=
[
1
,
3
,
256
,
2256
,
4096
]
NUM_TOKENS
=
[
1
,
3
,
256
,
2256
,
4096
]
NUM_EXPERTS
=
[
32
,
160
,
256
,
257
]
NUM_EXPERTS
=
[
32
,
160
,
256
,
257
]
TOP_KS
=
[
1
,
2
,
16
,
32
]
TOP_KS
=
[
1
,
2
,
16
,
32
]
BLOCK_SIZES
=
[
32
,
128
]
BLOCK_SIZES
=
[
32
,
128
]
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
def
_group_tokens_by_expert
(
def
_group_tokens_by_expert
(
...
...
tests/kernels/moe/test_pplx_moe.py
View file @
7e63ef82
...
@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
...
@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
TopKWeightAndReduceDelegate
,
TopKWeightAndReduceDelegate
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
...
@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
...
@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
...
@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
...
@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
if
per_act_token_quant
and
block_shape
is
not
None
:
if
per_act_token_quant
and
block_shape
is
not
None
:
pytest
.
skip
(
"Skip illegal quantization combination"
)
pytest
.
skip
(
"Skip illegal quantization combination"
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
m
,
n
,
k
=
mnk
m
,
n
,
k
=
mnk
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
device
=
"cuda"
device
=
"cuda"
...
@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
...
@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
block_shape
:
list
[
int
]
|
None
,
block_shape
:
list
[
int
]
|
None
,
use_internode
:
bool
,
use_internode
:
bool
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
m
,
n
,
k
=
mnk
m
,
n
,
k
=
mnk
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
...
@@ -888,7 +888,7 @@ def _pplx_test_loop(
...
@@ -888,7 +888,7 @@ def _pplx_test_loop(
new_vllm_config
.
parallel_config
.
enable_expert_parallel
=
True
new_vllm_config
.
parallel_config
.
enable_expert_parallel
=
True
_set_vllm_config
(
new_vllm_config
,
pgi
.
world_size
,
pgi
.
rank
,
pgi
.
local_rank
)
_set_vllm_config
(
new_vllm_config
,
pgi
.
world_size
,
pgi
.
rank
,
pgi
.
local_rank
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
combos
=
itertools
.
product
(
combos
=
itertools
.
product
(
PPLX_COMBOS
,
NUM_EXPERTS
,
TOP_KS
,
DTYPES
,
[
False
,
True
],
[
None
,
[
128
,
128
]]
PPLX_COMBOS
,
NUM_EXPERTS
,
TOP_KS
,
DTYPES
,
[
False
,
True
],
[
None
,
[
128
,
128
]]
)
)
...
@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
...
@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
use_internode
:
bool
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
parallel_launch
(
parallel_launch
(
world_size
*
dp_size
,
world_size
*
dp_size
,
...
@@ -1005,7 +1005,7 @@ def test_pplx_moe(
...
@@ -1005,7 +1005,7 @@ def test_pplx_moe(
use_internode
:
bool
,
use_internode
:
bool
,
use_shared_experts
:
bool
,
use_shared_experts
:
bool
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
parallel_launch
(
parallel_launch
(
world_size
,
world_size
,
...
...
tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
View file @
7e63ef82
...
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
...
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
triton
from
vllm.triton_utils
import
triton
from
vllm.utils.deep_gemm
import
is_deep_gemm_e8m0_used
from
vllm.utils.deep_gemm
import
is_deep_gemm_e8m0_used
from
vllm.utils.torch_utils
import
set_random_seed
FLOAT8_DTYPE
=
torch
.
float8_e4m3fn
FLOAT8_DTYPE
=
torch
.
float8_e4m3fn
GROUP_SIZE
=
128
GROUP_SIZE
=
128
...
@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
...
@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
@
pytest
.
mark
.
parametrize
(
"T"
,
[
128
,
256
,
512
])
@
pytest
.
mark
.
parametrize
(
"T"
,
[
128
,
256
,
512
])
@
pytest
.
mark
.
parametrize
(
"N"
,
[
128
*
2
,
256
*
2
,
768
*
2
,
2048
*
2
,
7168
*
2
])
@
pytest
.
mark
.
parametrize
(
"N"
,
[
128
*
2
,
256
*
2
,
768
*
2
,
2048
*
2
,
7168
*
2
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"ROCm does not support DeepGemm."
,
)
def
test_silu_mul_fp8_quant_deep_gemm
(
T
:
int
,
N
:
int
):
def
test_silu_mul_fp8_quant_deep_gemm
(
T
:
int
,
N
:
int
):
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
input
=
torch
.
rand
((
T
,
N
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
input
=
torch
.
rand
((
T
,
N
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
...
...
tests/kernels/moe/test_triton_moe_no_act_mul.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for MoE with non-gated activations (*_no_mul).
These tests verify that MoE layers work correctly with activations like
silu_no_mul, gelu_no_mul, relu2_no_mul where the activation output dimension
equals N (not N // 2 like gated activations).
"""
import
pytest
import
torch
from
vllm.model_executor.layers.fused_moe.config
import
(
FUSED_MOE_UNQUANTIZED_CONFIG
,
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
TritonExperts
from
vllm.model_executor.layers.fused_moe.utils
import
(
GELU_NO_MUL
,
RELU2_NO_MUL
,
SILU_NO_MUL
,
)
from
vllm.platforms
import
current_platform
# Test parameters
M_SIZES
=
[
1
,
16
,
64
]
N_SIZES
=
[
128
,
256
]
K_SIZES
=
[
64
,
128
]
TOPK_VALUES
=
[
1
,
2
]
NUM_EXPERTS
=
8
NO_MUL_ACTIVATIONS
=
[
SILU_NO_MUL
,
GELU_NO_MUL
,
RELU2_NO_MUL
]
def
make_test_tensors
(
m
:
int
,
n
:
int
,
k
:
int
,
num_experts
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
=
torch
.
bfloat16
,
device
:
str
=
"cuda"
,
):
"""Create test tensors for MoE with non-gated activation.
For non-gated activations (*_no_mul):
- w1: (E, N, K) - projects from K to N
- w2: (E, K, N) - projects from N back to K (note: N, not N//2)
"""
hidden_states
=
torch
.
randn
(
m
,
k
,
dtype
=
dtype
,
device
=
device
)
# For non-gated: w1 projects K -> N, w2 projects N -> K
w1
=
torch
.
randn
(
num_experts
,
n
,
k
,
dtype
=
dtype
,
device
=
device
)
*
0.1
w2
=
torch
.
randn
(
num_experts
,
k
,
n
,
dtype
=
dtype
,
device
=
device
)
*
0.1
topk_weights
=
torch
.
ones
(
m
,
topk
,
dtype
=
torch
.
float32
,
device
=
device
)
/
topk
topk_ids
=
torch
.
randint
(
0
,
num_experts
,
(
m
,
topk
),
device
=
device
)
return
hidden_states
,
w1
,
w2
,
topk_weights
,
topk_ids
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
80
),
reason
=
"Requires compute capability >= 8.0"
,
)
@
pytest
.
mark
.
parametrize
(
"m"
,
M_SIZES
)
@
pytest
.
mark
.
parametrize
(
"n"
,
N_SIZES
)
@
pytest
.
mark
.
parametrize
(
"k"
,
K_SIZES
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOPK_VALUES
)
@
pytest
.
mark
.
parametrize
(
"activation"
,
NO_MUL_ACTIVATIONS
)
@
torch
.
inference_mode
()
def
test_triton_experts_no_mul_activation
(
m
:
int
,
n
:
int
,
k
:
int
,
topk
:
int
,
activation
:
str
,
):
hidden_states
,
w1
,
w2
,
topk_weights
,
topk_ids
=
make_test_tensors
(
m
,
n
,
k
,
NUM_EXPERTS
,
topk
)
experts
=
TritonExperts
(
FUSED_MOE_UNQUANTIZED_CONFIG
)
ws1_shape
,
ws2_shape
,
out_shape
=
experts
.
workspace_shapes
(
M
=
m
,
N
=
n
,
K
=
k
,
topk
=
topk
,
global_num_experts
=
NUM_EXPERTS
,
local_num_experts
=
NUM_EXPERTS
,
expert_tokens_meta
=
None
,
activation
=
activation
,
)
# Verify workspace shapes are correct for no_mul activation
# workspace1 should handle activation_out_dim = N (not N//2)
assert
ws1_shape
==
(
m
,
topk
,
max
(
n
,
k
)),
(
f
"workspace1 shape mismatch: expected
{
(
m
,
topk
,
max
(
n
,
k
))
}
, got
{
ws1_shape
}
"
)
# workspace2 should handle max(N, K) for intermediate_cache1/cache3
assert
ws2_shape
==
(
m
,
topk
,
max
(
n
,
k
)),
(
f
"workspace2 shape mismatch: expected
{
(
m
,
topk
,
max
(
n
,
k
))
}
, got
{
ws2_shape
}
"
)
assert
out_shape
==
(
m
,
k
),
(
f
"output shape mismatch: expected
{
(
m
,
k
)
}
, got
{
out_shape
}
"
)
workspace1
=
torch
.
empty
(
ws1_shape
[
0
]
*
ws1_shape
[
1
]
*
ws1_shape
[
2
],
dtype
=
hidden_states
.
dtype
,
device
=
hidden_states
.
device
,
)
workspace2
=
torch
.
empty
(
ws2_shape
[
0
]
*
ws2_shape
[
1
]
*
ws2_shape
[
2
],
dtype
=
hidden_states
.
dtype
,
device
=
hidden_states
.
device
,
)
output
=
torch
.
zeros
(
m
,
k
,
dtype
=
hidden_states
.
dtype
,
device
=
hidden_states
.
device
)
experts
.
apply
(
output
=
output
,
hidden_states
=
hidden_states
,
w1
=
w1
,
w2
=
w2
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
activation
=
activation
,
global_num_experts
=
NUM_EXPERTS
,
expert_map
=
None
,
a1q_scale
=
None
,
a2_scale
=
None
,
workspace13
=
workspace1
,
workspace2
=
workspace2
,
expert_tokens_meta
=
None
,
apply_router_weight_on_input
=
False
,
)
assert
output
.
shape
==
(
m
,
k
),
f
"Expected shape
{
(
m
,
k
)
}
, got
{
output
.
shape
}
"
assert
not
torch
.
isnan
(
output
).
any
(),
"Output contains NaN"
assert
not
torch
.
isinf
(
output
).
any
(),
"Output contains Inf"
assert
output
.
abs
().
sum
()
>
0
,
"Output is all zeros"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
80
),
reason
=
"Requires compute capability >= 8.0"
,
)
@
torch
.
inference_mode
()
def
test_workspace_shapes_no_mul_vs_gated
():
"""Test that workspace shapes differ correctly between gated and non-gated."""
from
vllm.model_executor.layers.fused_moe.fused_moe
import
TritonExperts
M
,
N
,
K
,
topk
=
64
,
256
,
128
,
2
experts
=
TritonExperts
(
FUSED_MOE_UNQUANTIZED_CONFIG
)
ws1_no_mul
,
_
,
out_no_mul
=
experts
.
workspace_shapes
(
M
,
N
,
K
,
topk
,
8
,
8
,
None
,
SILU_NO_MUL
)
ws1_gated
,
_
,
out_gated
=
experts
.
workspace_shapes
(
M
,
N
,
K
,
topk
,
8
,
8
,
None
,
"silu"
)
# For no_mul: activation_out_dim = N
# For gated: activation_out_dim = N // 2
# workspace1 should use max(activation_out_dim, K)
activation_out_dim_no_mul
=
N
activation_out_dim_gated
=
N
//
2
assert
ws1_no_mul
[
2
]
==
max
(
activation_out_dim_no_mul
,
K
),
(
f
"no_mul workspace1 last dim should be max(
{
activation_out_dim_no_mul
}
,
{
K
}
)"
)
assert
ws1_gated
[
2
]
==
max
(
activation_out_dim_gated
,
K
),
(
f
"gated workspace1 last dim should be max(
{
activation_out_dim_gated
}
,
{
K
}
)"
)
# Output shapes should be the same
assert
out_no_mul
==
out_gated
==
(
M
,
K
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
80
),
reason
=
"Requires compute capability >= 8.0"
,
)
@
torch
.
inference_mode
()
def
test_adjust_n_for_activation
():
"""Test the adjust_N_for_activation method."""
from
vllm.model_executor.layers.fused_moe.fused_moe
import
TritonExperts
experts
=
TritonExperts
(
FUSED_MOE_UNQUANTIZED_CONFIG
)
N
=
256
# Gated activations should return N // 2
assert
experts
.
adjust_N_for_activation
(
N
,
"silu"
)
==
N
//
2
assert
experts
.
adjust_N_for_activation
(
N
,
"gelu"
)
==
N
//
2
# Non-gated activations should return N
assert
experts
.
adjust_N_for_activation
(
N
,
SILU_NO_MUL
)
==
N
assert
experts
.
adjust_N_for_activation
(
N
,
GELU_NO_MUL
)
==
N
assert
experts
.
adjust_N_for_activation
(
N
,
RELU2_NO_MUL
)
==
N
tests/kernels/moe/untest_cutlass_moe.py
View file @
7e63ef82
...
@@ -7,19 +7,25 @@ from math import prod
...
@@ -7,19 +7,25 @@ from math import prod
import
pytest
import
pytest
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FUSED_MOE_UNQUANTIZED_CONFIG
,
FUSED_MOE_UNQUANTIZED_CONFIG
,
FusedMoEQuantConfig
,
fp8_w8a8_moe_quant_config
,
fp8_w8a8_moe_quant_config
,
)
)
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
(
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
(
c
utlass
_moe_f
p8
,
C
utlass
ExpertsF
p8
,
run_cutlass_moe_fp8
,
run_cutlass_moe_fp8
,
)
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
,
fused_topk
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
,
fused_topk
from
vllm.model_executor.layers.fused_moe.prepare_finalize
import
(
MoEPrepareAndFinalizeNoEP
,
)
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_EXPERTS
=
[
40
,
64
]
NUM_EXPERTS
=
[
40
,
64
]
TOP_KS
=
[
6
,
8
]
TOP_KS
=
[
6
,
8
]
...
@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors):
...
@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors):
def
run_with_expert_maps
(
def
run_with_expert_maps
(
num_experts
:
int
,
num_local_experts
:
int
,
**
cutlass_moe_kwargs
num_experts
:
int
,
num_local_experts
:
int
,
quant_config
:
FusedMoEQuantConfig
,
**
cutlass_moe_kwargs
,
):
):
def
slice_experts
():
def
slice_experts
():
slice_params
=
[
slice_params
=
[
"w1_q"
,
"w1"
,
"w2_q"
,
"w2"
,
"ab_strides1"
,
"ab_strides2"
,
"c_strides1"
,
"c_strides2"
,
]
]
full_tensors
=
{
full_tensors
=
{
k
:
v
k
:
v
...
@@ -166,8 +171,6 @@ def run_with_expert_maps(
...
@@ -166,8 +171,6 @@ def run_with_expert_maps(
if
k
in
slice_params
and
k
in
cutlass_moe_kwargs
if
k
in
slice_params
and
k
in
cutlass_moe_kwargs
}
}
quant_config
=
cutlass_moe_kwargs
[
"quant_config"
]
for
i
in
range
(
0
,
num_experts
,
num_local_experts
):
for
i
in
range
(
0
,
num_experts
,
num_local_experts
):
s
,
e
=
i
,
i
+
num_local_experts
s
,
e
=
i
,
i
+
num_local_experts
...
@@ -186,13 +189,23 @@ def run_with_expert_maps(
...
@@ -186,13 +189,23 @@ def run_with_expert_maps(
new_quant_config
.
_w1
.
scale
=
quant_config
.
w1_scale
[
s
:
e
]
new_quant_config
.
_w1
.
scale
=
quant_config
.
w1_scale
[
s
:
e
]
new_quant_config
.
_w2
.
scale
=
quant_config
.
w2_scale
[
s
:
e
]
new_quant_config
.
_w2
.
scale
=
quant_config
.
w2_scale
[
s
:
e
]
cutlass_moe_kwargs
[
"quant_config"
]
=
new_quant_config
yield
cutlass_moe_kwargs
,
new_quant_config
yield
cutlass_moe_kwargs
out_tensor
=
torch
.
zeros_like
(
cutlass_moe_kwargs
[
"hidden_states"
])
for
kwargs
,
new_quant_config
in
slice_experts
():
out_tensor
=
torch
.
zeros_like
(
cutlass_moe_kwargs
[
"a"
])
kernel
=
mk
.
FusedMoEModularKernel
(
for
kwargs
in
slice_experts
():
MoEPrepareAndFinalizeNoEP
(),
out_tensor
=
out_tensor
+
cutlass_moe_fp8
(
**
kwargs
)
CutlassExpertsFp8
(
out_dtype
=
kwargs
[
"hidden_states"
].
dtype
,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e
=
kwargs
[
"w2"
].
shape
[
0
],
# type: ignore[union-attr]
n
=
kwargs
[
"w2"
].
shape
[
2
],
# type: ignore[union-attr]
k
=
kwargs
[
"w2"
].
shape
[
1
],
# type: ignore[union-attr]
quant_config
=
new_quant_config
,
device
=
"cuda"
,
),
)
out_tensor
=
out_tensor
+
kernel
(
**
kwargs
)
return
out_tensor
return
out_tensor
...
@@ -229,27 +242,35 @@ def run_8_bit(
...
@@ -229,27 +242,35 @@ def run_8_bit(
)
)
kwargs
=
{
kwargs
=
{
"
a
"
:
moe_tensors
.
a
,
"
hidden_states
"
:
moe_tensors
.
a
,
"w1
_q
"
:
moe_tensors
.
w1_q
,
# type: ignore[union-attr]
"w1"
:
moe_tensors
.
w1_q
,
# type: ignore[union-attr]
"w2
_q
"
:
moe_tensors
.
w2_q
,
# type: ignore[union-attr]
"w2"
:
moe_tensors
.
w2_q
,
# type: ignore[union-attr]
"topk_weights"
:
topk_weights
,
"topk_weights"
:
topk_weights
,
"topk_ids"
:
topk_ids
,
"topk_ids"
:
topk_ids
,
"ab_strides1"
:
moe_tensors
.
ab_strides1
,
"ab_strides2"
:
moe_tensors
.
ab_strides2
,
"c_strides1"
:
moe_tensors
.
c_strides1
,
"c_strides2"
:
moe_tensors
.
c_strides2
,
"quant_config"
:
quant_config
,
}
}
num_experts
=
moe_tensors
.
w1
.
size
(
0
)
num_experts
=
moe_tensors
.
w1
.
size
(
0
)
with_ep
=
num_local_experts
is
not
None
or
num_local_experts
==
num_experts
with_ep
=
num_local_experts
is
not
None
or
num_local_experts
==
num_experts
if
not
with_ep
:
if
not
with_ep
:
return
cutlass_moe_fp8
(
**
kwargs
)
kernel
=
mk
.
FusedMoEModularKernel
(
MoEPrepareAndFinalizeNoEP
(),
CutlassExpertsFp8
(
out_dtype
=
moe_tensors
.
a
.
dtype
,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e
=
moe_tensors
.
w2_q
.
shape
[
0
],
# type: ignore[union-attr]
n
=
moe_tensors
.
w2_q
.
shape
[
2
],
# type: ignore[union-attr]
k
=
moe_tensors
.
w2_q
.
shape
[
1
],
# type: ignore[union-attr]
quant_config
=
quant_config
,
device
=
"cuda"
,
),
)
return
kernel
(
**
kwargs
)
assert
num_local_experts
is
not
None
assert
num_local_experts
is
not
None
return
run_with_expert_maps
(
return
run_with_expert_maps
(
num_experts
,
num_experts
,
num_local_experts
,
# type: ignore[arg-type]
num_local_experts
,
# type: ignore[arg-type]
quant_config
,
**
kwargs
,
**
kwargs
,
)
)
...
@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph(
...
@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph(
workspace_init
,
workspace_init
,
ep_size
:
int
|
None
=
None
,
ep_size
:
int
|
None
=
None
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_ch
)
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_ch
)
...
@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
...
@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
monkeypatch
,
monkeypatch
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
dtype
=
torch
.
half
dtype
=
torch
.
half
...
@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8(
...
@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8(
ep_size
:
int
,
ep_size
:
int
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_channel
m
,
k
,
n
,
e
,
per_act_token
,
per_out_channel
...
...
tests/kernels/moe/untest_moe_permute_unpermute.py
View file @
7e63ef82
...
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
...
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_unpermute
,
moe_unpermute
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_EXPERTS
=
[
16
,
64
,
256
]
NUM_EXPERTS
=
[
16
,
64
,
256
]
TOP_KS
=
[
2
,
6
,
8
]
TOP_KS
=
[
2
,
6
,
8
]
EP_SIZE
=
[
1
,
4
,
16
]
EP_SIZE
=
[
1
,
4
,
16
]
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
pytest
.
skip
(
pytest
.
skip
(
...
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
...
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
n_local_expert
,
expert_map
,
_
=
determine_expert_map
(
ep_size
,
ep_rank
,
n_expert
)
n_local_expert
,
expert_map
,
_
=
determine_expert_map
(
ep_size
,
ep_rank
,
n_expert
)
expert_map
=
expert_map
.
cuda
()
expert_map
=
expert_map
.
cuda
()
start_expert
=
n_local_expert
*
ep_rank
start_expert
=
n_local_expert
*
ep_rank
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
device
=
"cuda"
).
to
(
dtype
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
device
=
"cuda"
).
to
(
dtype
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
device
=
"cuda"
).
to
(
dtype
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
device
=
"cuda"
).
to
(
dtype
)
topk_weights
,
topk_ids
,
token_expert_indices
=
fused_topk
(
topk_weights
,
topk_ids
,
token_expert_indices
=
fused_topk
(
...
...
tests/kernels/moe/untest_nvfp4_moe.py
View file @
7e63ef82
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
pytest
import
pytest
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
tests.kernels.moe.utils
import
make_test_weights
from
tests.kernels.moe.utils
import
make_test_weights
from
tests.kernels.quantization.nvfp4_utils
import
(
from
tests.kernels.quantization.nvfp4_utils
import
(
FLOAT4_E2M1_MAX
,
FLOAT4_E2M1_MAX
,
...
@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe
...
@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.config
import
nvfp4_moe_quant_config
from
vllm.model_executor.layers.fused_moe.config
import
nvfp4_moe_quant_config
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
cutlass_moe_fp4
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
(
CutlassExpertsFp4
,
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.model_executor.layers.fused_moe.prepare_finalize
import
(
MoEPrepareAndFinalizeNoEP
,
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -42,7 +49,7 @@ MNK_FACTORS = [
...
@@ -42,7 +49,7 @@ MNK_FACTORS = [
def
test_cutlass_fp4_moe_no_graph
(
def
test_cutlass_fp4_moe_no_graph
(
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
workspace_init
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
workspace_init
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
with
set_current_vllm_config
(
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
):
):
...
@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
...
@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
w2_scale
=
w2_blockscale
,
w2_scale
=
w2_blockscale
,
)
)
cutlass_output
=
cutlass_moe_fp4
(
kernel
=
mk
.
FusedMoEModularKernel
(
a
=
a
,
MoEPrepareAndFinalizeNoEP
(
defer_input_quant
=
True
),
w1_fp4
=
w1_q
,
CutlassExpertsFp4
(
w2_fp4
=
w2_q
,
out_dtype
=
dtype
,
max_experts_per_worker
=
e
,
quant_config
=
quant_config
,
),
)
cutlass_output
=
kernel
(
hidden_states
=
a
,
w1
=
w1_q
,
w2
=
w2_q
,
topk_weights
=
topk_weights
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
topk_ids
=
topk_ids
,
quant_config
=
quant_config
,
m
=
m
,
n
=
n
,
k
=
k
,
e
=
e
,
)
)
# Reference check:
# Reference check:
...
...
tests/kernels/moe/untest_pplx_cutlass_moe.py
View file @
7e63ef82
...
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
...
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.torch_utils
import
set_random_seed
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
...
@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
...
@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
use_internode
:
bool
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
dtype
=
torch
.
half
dtype
=
torch
.
half
...
...
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
View file @
7e63ef82
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.deep_gemm
import
DeepGemmQuantScaleFMT
,
has_deep_gemm
from
vllm.utils.deep_gemm
import
DeepGemmQuantScaleFMT
,
has_deep_gemm
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.torch_utils
import
set_random_seed
if
current_platform
.
is_fp8_fnuz
():
if
current_platform
.
is_fp8_fnuz
():
pytest
.
skip
(
pytest
.
skip
(
...
@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
...
@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_silu_mul_fp8_quant_deep_gemm
(
E
:
int
,
T
:
int
,
H
:
int
,
fp8_type
:
torch
.
dtype
):
def
test_silu_mul_fp8_quant_deep_gemm
(
E
:
int
,
T
:
int
,
H
:
int
,
fp8_type
:
torch
.
dtype
):
group_size
=
128
group_size
=
128
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
tokens_per_expert
=
torch
.
randint
(
tokens_per_expert
=
torch
.
randint
(
low
=
0
,
low
=
0
,
...
...
tests/kernels/quant_utils.py
View file @
7e63ef82
...
@@ -4,13 +4,13 @@
...
@@ -4,13 +4,13 @@
import
torch
import
torch
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
group_broadcast
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
get_fp8_min_max
,
group_broadcast
,
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.math_utils
import
round_up
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8FNUZ_MAX
=
224.0
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
...
@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant(
...
@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant(
if
scale_ub
is
not
None
:
if
scale_ub
is
not
None
:
assert
quant_dtype
==
FP8_DTYPE
assert
quant_dtype
==
FP8_DTYPE
qtype_traits
=
(
if
quant_dtype
==
torch
.
int8
:
torch
.
iinfo
(
quant_dtype
)
qtype_traits
=
torch
.
iinfo
(
quant_dtype
)
if
quant_dtype
==
torch
.
int8
qtype_traits_min
=
qtype_traits
.
min
else
torch
.
finfo
(
quant_dtype
)
qtype_traits_max
=
qtype_traits
.
max
)
else
:
use_fp8fnuz
=
(
qtype_traits_min
,
qtype_traits_max
=
get_fp8_min_max
()
current_platform
.
is_fp8_fnuz
()
and
quant_dtype
==
current_platform
.
fp8_dtype
()
)
qtype_traits_max
=
ROCM_FP8FNUZ_MAX
if
use_fp8fnuz
else
qtype_traits
.
max
qtype_traits_min
=
-
ROCM_FP8FNUZ_MAX
if
use_fp8fnuz
else
qtype_traits
.
min
qtype_max
=
as_float32_tensor
(
qtype_traits_max
)
qtype_max
=
as_float32_tensor
(
qtype_traits_max
)
s_1
=
as_float32_tensor
(
1.0
)
s_1
=
as_float32_tensor
(
1.0
)
s_512
=
as_float32_tensor
(
512.0
)
s_512
=
as_float32_tensor
(
512.0
)
...
@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant(
...
@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant(
def
ref_dynamic_per_tensor_fp8_quant
(
def
ref_dynamic_per_tensor_fp8_quant
(
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
fp8_traits
=
torch
.
finfo
(
FP8_DTYPE
)
fp8_traits_min
,
fp8_traits_max
=
get_fp8_min_max
()
fp8_traits_max
=
(
ROCM_FP8FNUZ_MAX
if
current_platform
.
is_rocm
()
and
current_platform
.
is_fp8_fnuz
()
else
fp8_traits
.
max
)
fp8_traits_min
=
(
-
ROCM_FP8FNUZ_MAX
if
current_platform
.
is_rocm
()
and
current_platform
.
is_fp8_fnuz
()
else
fp8_traits
.
min
)
fp8_max
=
as_float32_tensor
(
fp8_traits_max
)
fp8_max
=
as_float32_tensor
(
fp8_traits_max
)
one
=
as_float32_tensor
(
1.0
)
one
=
as_float32_tensor
(
1.0
)
...
...
tests/kernels/quantization/test_awq_triton.py
View file @
7e63ef82
...
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
...
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton
,
awq_dequantize_triton
,
awq_gemm_triton
,
awq_gemm_triton
,
)
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
device
=
"cuda"
device
=
"cuda"
...
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
...
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols
=
qweight_cols
zeros_cols
=
qweight_cols
zeros_dtype
=
torch
.
int32
zeros_dtype
=
torch
.
int32
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
qweight
=
torch
.
randint
(
qweight
=
torch
.
randint
(
0
,
0
,
...
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
...
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows
=
scales_rows
qzeros_rows
=
scales_rows
qzeros_cols
=
qweight_cols
qzeros_cols
=
qweight_cols
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
input
=
torch
.
rand
((
input_rows
,
input_cols
),
dtype
=
input_dtype
,
device
=
device
)
input
=
torch
.
rand
((
input_rows
,
input_cols
),
dtype
=
input_dtype
,
device
=
device
)
qweight
=
torch
.
randint
(
qweight
=
torch
.
randint
(
...
...
tests/kernels/quantization/test_cutlass_w4a8_moe.py
View file @
7e63ef82
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
IS_SUPPORTED_BY_GPU
=
(
IS_SUPPORTED_BY_GPU
=
(
current_platform
.
is_cuda
()
and
current_platform
.
get_device_capability
()[
0
]
>=
9
current_platform
.
is_cuda
()
and
current_platform
.
get_device_capability
()[
0
]
>=
9
...
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
...
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@
pytest
.
mark
.
parametrize
(
"random_zero"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"random_zero"
,
[
True
,
False
])
def
test_cutlass_w4a8_moe_mm_end_to_end
(
shape
,
random_zero
):
def
test_cutlass_w4a8_moe_mm_end_to_end
(
shape
,
random_zero
):
num_experts
,
N
,
K
=
shape
num_experts
,
N
,
K
=
shape
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
setup
=
make_moe_test_setup
(
setup
=
make_moe_test_setup
(
num_experts
=
num_experts
,
K
=
K
,
N
=
N
,
max_blocks
=
64
,
random_zero
=
random_zero
num_experts
=
num_experts
,
K
=
K
,
N
=
N
,
max_blocks
=
64
,
random_zero
=
random_zero
)
)
...
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
...
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason
=
"W4A8 Grouped GEMM is not supported on this GPU type."
,
reason
=
"W4A8 Grouped GEMM is not supported on this GPU type."
,
)
)
def
test_cutlass_w4a8_moe_mm_cuda_graph
():
def
test_cutlass_w4a8_moe_mm_cuda_graph
():
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
# Fixed config for CUDA graph test (single parameter point).
# Fixed config for CUDA graph test (single parameter point).
num_experts
=
8
num_experts
=
8
K
=
512
K
=
512
...
...
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
View file @
7e63ef82
...
@@ -12,6 +12,7 @@ from nvfp4_utils import (
...
@@ -12,6 +12,7 @@ from nvfp4_utils import (
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp4_mm
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp4_mm
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
...
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if
backend
==
"trtllm"
and
dtype
==
torch
.
float16
:
if
backend
==
"trtllm"
and
dtype
==
torch
.
float16
:
pytest
.
skip
(
"Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations"
)
pytest
.
skip
(
"Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations"
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
packed_k
=
shape
m
,
n
,
packed_k
=
shape
k
=
packed_k
*
2
k
=
packed_k
*
2
block_size
=
16
block_size
=
16
...
...
tests/kernels/quantization/test_flashinfer_scaled_mm.py
View file @
7e63ef82
...
@@ -6,6 +6,7 @@ import torch
...
@@ -6,6 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp8_mm
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp8_mm
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
...
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device
:
str
,
device
:
str
,
autotune
:
bool
,
autotune
:
bool
,
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
k
=
shape
m
,
n
,
k
=
shape
a
=
torch
.
randn
((
m
,
k
),
dtype
=
dtype
,
device
=
device
)
a
=
torch
.
randn
((
m
,
k
),
dtype
=
dtype
,
device
=
device
)
b
=
torch
.
randn
((
n
,
k
),
dtype
=
dtype
,
device
=
device
)
/
k
b
=
torch
.
randn
((
n
,
k
),
dtype
=
dtype
,
device
=
device
)
/
k
...
...
tests/kernels/quantization/test_fp8_min_max_helper.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for the get_fp8_min_max() helper function.
These tests verify the FP8 min/max value logic for both standard
and fnuz (ROCm MI300) dtype handling.
"""
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
get_fp8_min_max
,
)
class TestGetFp8MinMax:
    """Checks on the (min, max) pair returned by get_fp8_min_max().

    Each test patches the ``current_platform`` name inside the quant_utils
    module and configures the mock's ``is_fp8_fnuz`` / ``fp8_dtype`` return
    values before calling the helper.
    """

    # Dotted path of the object replaced by unittest.mock.patch in every test.
    _PLATFORM_TARGET = (
        "vllm.model_executor.layers.quantization.utils.quant_utils.current_platform"
    )

    @patch(_PLATFORM_TARGET)
    def test_standard_fp8_platform(self, mock_platform):
        """With is_fp8_fnuz() False and float8_e4m3fn, expect torch.finfo limits."""
        mock_platform.configure_mock(
            **{
                "is_fp8_fnuz.return_value": False,
                "fp8_dtype.return_value": torch.float8_e4m3fn,
            }
        )
        finfo = torch.finfo(torch.float8_e4m3fn)

        fp8_min, fp8_max = get_fp8_min_max()

        # Standard FP8 max is 448.0 for e4m3fn
        assert fp8_max == finfo.max, f"Expected finfo.max={finfo.max}, got {fp8_max}"
        assert fp8_min == finfo.min, f"Expected finfo.min={finfo.min}, got {fp8_min}"

    @patch(_PLATFORM_TARGET)
    def test_fnuz_platform_returns_224(self, mock_platform):
        """With is_fp8_fnuz() True, expect the fixed +/-224.0 pair."""
        mock_platform.configure_mock(**{"is_fp8_fnuz.return_value": True})

        fp8_min, fp8_max = get_fp8_min_max()

        # fnuz on ROCm MI300 should return 224.0, not 240.0
        assert fp8_max == 224.0, f"Expected 224.0 for fnuz platform, got {fp8_max}"
        assert fp8_min == -224.0, f"Expected -224.0 for fnuz platform, got {fp8_min}"

    @patch(_PLATFORM_TARGET)
    def test_non_fnuz_platform_uses_finfo(self, mock_platform):
        """With is_fp8_fnuz() False, expect the float8_e4m3fn finfo limits."""
        mock_platform.configure_mock(
            **{
                "is_fp8_fnuz.return_value": False,
                "fp8_dtype.return_value": torch.float8_e4m3fn,
            }
        )
        finfo = torch.finfo(torch.float8_e4m3fn)

        fp8_min, fp8_max = get_fp8_min_max()

        assert fp8_max == finfo.max, (
            f"Non-fnuz platform should use finfo.max={finfo.max}, got {fp8_max}"
        )
        assert fp8_min == finfo.min, (
            f"Non-fnuz platform should use finfo.min={finfo.min}, got {fp8_min}"
        )
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode.
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)
tests/kernels/quantization/test_fp8_quant_group.py
View file @
7e63ef82
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
from
vllm.model_executor.layers.quantization.input_quant_fp8
import
QuantFP8
from
vllm.model_executor.layers.quantization.input_quant_fp8
import
QuantFP8
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -23,14 +23,19 @@ from vllm.platforms import current_platform
...
@@ -23,14 +23,19 @@ from vllm.platforms import current_platform
@
pytest
.
mark
.
parametrize
(
"use_ue8m0"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_ue8m0"
,
[
True
,
False
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_quantfp8_group_functionality
(
def
test_quantfp8_group_functionality
(
batch_size
:
int
,
hidden_dim
:
int
,
group_size
:
int
,
seed
:
int
,
use_ue8m0
:
bool
default_vllm_config
,
batch_size
:
int
,
hidden_dim
:
int
,
group_size
:
int
,
seed
:
int
,
use_ue8m0
:
bool
,
)
->
None
:
)
->
None
:
"""Test QuantFP8 group quantization with various configurations.
"""Test QuantFP8 group quantization with various configurations.
Tests both CUDA and native implementations, column-major scales,
Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations.
and verifies consistency between implementations.
"""
"""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
randn
((
batch_size
,
hidden_dim
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
*
8
x
=
torch
.
randn
((
batch_size
,
hidden_dim
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
*
8
expected_num_groups
=
(
hidden_dim
+
group_size
-
1
)
//
group_size
expected_num_groups
=
(
hidden_dim
+
group_size
-
1
)
//
group_size
...
@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality(
...
@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality(
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
42
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
42
])
@
pytest
.
mark
.
parametrize
(
"use_ue8m0"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_ue8m0"
,
[
True
,
False
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_quantfp8_group_multidimensional
(
seed
:
int
,
use_ue8m0
:
bool
)
->
None
:
def
test_quantfp8_group_multidimensional
(
current_platform
.
seed_everything
(
seed
)
default_vllm_config
,
seed
:
int
,
use_ue8m0
:
bool
)
->
None
:
set_random_seed
(
seed
)
group_size
=
64
group_size
=
64
...
@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
...
@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
42
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
42
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_quantfp8_group_edge_cases
(
seed
:
int
)
->
None
:
def
test_quantfp8_group_edge_cases
(
default_vllm_config
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
batch_size
=
16
batch_size
=
16
group_size
=
64
group_size
=
64
...
...
tests/kernels/quantization/test_gguf.py
View file @
7e63ef82
...
@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download
...
@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download
import
vllm._custom_ops
as
ops
import
vllm._custom_ops
as
ops
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
from
vllm.utils.torch_utils
import
set_random_seed
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
...
@@ -95,7 +95,7 @@ def test_dequantize(
...
@@ -95,7 +95,7 @@ def test_dequantize(
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
QUANT_TYPES
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
QUANT_TYPES
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
@@ -138,7 +138,7 @@ def test_mmq(
...
@@ -138,7 +138,7 @@ def test_mmq(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
,
quant_type
:
GGMLQuantizationType
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
@@ -173,7 +173,7 @@ def test_moe(
...
@@ -173,7 +173,7 @@ def test_moe(
quant_type
:
GGMLQuantizationType
,
quant_type
:
GGMLQuantizationType
,
top_k
:
int
,
top_k
:
int
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
H
,
E
=
1024
,
256
H
,
E
=
1024
,
256
x
=
torch
.
rand
((
num_tokens
,
H
),
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
((
num_tokens
,
H
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
tests/kernels/quantization/test_int8_kernel.py
View file @
7e63ef82
...
@@ -107,7 +107,7 @@ SEEDS = [0]
...
@@ -107,7 +107,7 @@ SEEDS = [0]
itertools
.
product
(
M
,
N
,
K
,
E
,
TOP_KS
,
DTYPES
,
SEEDS
),
itertools
.
product
(
M
,
N
,
K
,
E
,
TOP_KS
,
DTYPES
,
SEEDS
),
)
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_w8a8_fp8_fused_moe
(
M
,
N
,
K
,
E
,
topk
,
dtype
,
seed
):
def
test_w8a8_fp8_fused_moe
(
default_vllm_config
,
M
,
N
,
K
,
E
,
topk
,
dtype
,
seed
):
torch
.
manual_seed
(
seed
)
torch
.
manual_seed
(
seed
)
# Initialize int8 quantization parameters
# Initialize int8 quantization parameters
factor_for_scale
=
1e-2
factor_for_scale
=
1e-2
...
...
Prev
1
…
20
21
22
23
24
25
26
27
28
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment