Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bb4337b3
Unverified
Commit
bb4337b3
authored
Jan 05, 2026
by
wangxiyuan
Committed by
GitHub
Jan 04, 2026
Browse files
[Platform] Deprecate seed_everything (#31659)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
367856de
Changes
77
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
58 additions
and
44 deletions
+58
-44
tests/kernels/moe/test_nvfp4_moe.py
tests/kernels/moe/test_nvfp4_moe.py
+2
-1
tests/kernels/moe/test_pplx_cutlass_moe.py
tests/kernels/moe/test_pplx_cutlass_moe.py
+2
-1
tests/kernels/moe/test_pplx_moe.py
tests/kernels/moe/test_pplx_moe.py
+7
-7
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+2
-1
tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
...s/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
+2
-1
tests/kernels/quantization/test_awq_triton.py
tests/kernels/quantization/test_awq_triton.py
+3
-3
tests/kernels/quantization/test_cutlass_w4a8_moe.py
tests/kernels/quantization/test_cutlass_w4a8_moe.py
+3
-2
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
...s/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+2
-1
tests/kernels/quantization/test_flashinfer_scaled_mm.py
tests/kernels/quantization/test_flashinfer_scaled_mm.py
+2
-1
tests/kernels/quantization/test_fp8_quant.py
tests/kernels/quantization/test_fp8_quant.py
+4
-4
tests/kernels/quantization/test_fp8_quant_group.py
tests/kernels/quantization/test_fp8_quant_group.py
+4
-4
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+4
-4
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+5
-5
tests/kernels/quantization/test_mxfp4_qutlass.py
tests/kernels/quantization/test_mxfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_nvfp4_quant.py
tests/kernels/quantization/test_nvfp4_quant.py
+3
-2
tests/kernels/quantization/test_nvfp4_qutlass.py
tests/kernels/quantization/test_nvfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_nvfp4_scaled_mm.py
tests/kernels/quantization/test_nvfp4_scaled_mm.py
+2
-1
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+2
-1
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+2
-1
tests/kernels/test_apply_repetition_penalties.py
tests/kernels/test_apply_repetition_penalties.py
+3
-2
No files found.
tests/kernels/moe/test_nvfp4_moe.py
View file @
bb4337b3
...
...
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
cutlass_moe_fp4
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -42,7 +43,7 @@ MNK_FACTORS = [
def
test_cutlass_fp4_moe_no_graph
(
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
workspace_init
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
):
...
...
tests/kernels/moe/test_pplx_cutlass_moe.py
View file @
bb4337b3
...
...
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.torch_utils
import
set_random_seed
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
...
...
@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
vllm_config
):
dtype
=
torch
.
half
...
...
tests/kernels/moe/test_pplx_moe.py
View file @
bb4337b3
...
...
@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
TopKWeightAndReduceDelegate
,
)
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
...
...
@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
dtype
:
torch
.
dtype
,
workspace_init
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
...
...
@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
if
per_act_token_quant
and
block_shape
is
not
None
:
pytest
.
skip
(
"Skip illegal quantization combination"
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
m
,
n
,
k
=
mnk
world_size
,
dp_size
=
world_dp_size
device
=
"cuda"
...
...
@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
block_shape
:
list
[
int
]
|
None
,
use_internode
:
bool
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
m
,
n
,
k
=
mnk
world_size
,
dp_size
=
world_dp_size
...
...
@@ -888,7 +888,7 @@ def _pplx_test_loop(
new_vllm_config
.
parallel_config
.
enable_expert_parallel
=
True
_set_vllm_config
(
new_vllm_config
,
pgi
.
world_size
,
pgi
.
rank
,
pgi
.
local_rank
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
combos
=
itertools
.
product
(
PPLX_COMBOS
,
NUM_EXPERTS
,
TOP_KS
,
DTYPES
,
[
False
,
True
],
[
None
,
[
128
,
128
]]
)
...
...
@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
parallel_launch
(
world_size
*
dp_size
,
...
...
@@ -1005,7 +1005,7 @@ def test_pplx_moe(
use_internode
:
bool
,
use_shared_experts
:
bool
,
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
parallel_launch
(
world_size
,
...
...
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
View file @
bb4337b3
...
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from
vllm.platforms
import
current_platform
from
vllm.utils.deep_gemm
import
DeepGemmQuantScaleFMT
,
has_deep_gemm
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.torch_utils
import
set_random_seed
if
current_platform
.
is_fp8_fnuz
():
pytest
.
skip
(
...
...
@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
@
torch
.
inference_mode
()
def
test_silu_mul_fp8_quant_deep_gemm
(
E
:
int
,
T
:
int
,
H
:
int
,
fp8_type
:
torch
.
dtype
):
group_size
=
128
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
tokens_per_expert
=
torch
.
randint
(
low
=
0
,
...
...
tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
View file @
bb4337b3
...
...
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
triton
from
vllm.utils.deep_gemm
import
is_deep_gemm_e8m0_used
from
vllm.utils.torch_utils
import
set_random_seed
FLOAT8_DTYPE
=
torch
.
float8_e4m3fn
GROUP_SIZE
=
128
...
...
@@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
reason
=
"ROCm does not support DeepGemm."
,
)
def
test_silu_mul_fp8_quant_deep_gemm
(
T
:
int
,
N
:
int
):
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
input
=
torch
.
rand
((
T
,
N
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
...
...
tests/kernels/quantization/test_awq_triton.py
View file @
bb4337b3
...
...
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton
,
awq_gemm_triton
,
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
device
=
"cuda"
...
...
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols
=
qweight_cols
zeros_dtype
=
torch
.
int32
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
qweight
=
torch
.
randint
(
0
,
...
...
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows
=
scales_rows
qzeros_cols
=
qweight_cols
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
input
=
torch
.
rand
((
input_rows
,
input_cols
),
dtype
=
input_dtype
,
device
=
device
)
qweight
=
torch
.
randint
(
...
...
tests/kernels/quantization/test_cutlass_w4a8_moe.py
View file @
bb4337b3
...
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
IS_SUPPORTED_BY_GPU
=
(
current_platform
.
is_cuda
()
and
current_platform
.
get_device_capability
()[
0
]
>=
9
...
...
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@
pytest
.
mark
.
parametrize
(
"random_zero"
,
[
True
,
False
])
def
test_cutlass_w4a8_moe_mm_end_to_end
(
shape
,
random_zero
):
num_experts
,
N
,
K
=
shape
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
setup
=
make_moe_test_setup
(
num_experts
=
num_experts
,
K
=
K
,
N
=
N
,
max_blocks
=
64
,
random_zero
=
random_zero
)
...
...
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason
=
"W4A8 Grouped GEMM is not supported on this GPU type."
,
)
def
test_cutlass_w4a8_moe_mm_cuda_graph
():
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
# Fixed config for CUDA graph test (single parameter point).
num_experts
=
8
K
=
512
...
...
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
View file @
bb4337b3
...
...
@@ -12,6 +12,7 @@ from nvfp4_utils import (
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp4_mm
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if
backend
==
"trtllm"
and
dtype
==
torch
.
float16
:
pytest
.
skip
(
"Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations"
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
packed_k
=
shape
k
=
packed_k
*
2
block_size
=
16
...
...
tests/kernels/quantization/test_flashinfer_scaled_mm.py
View file @
bb4337b3
...
...
@@ -6,6 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp8_mm
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device
:
str
,
autotune
:
bool
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
k
=
shape
a
=
torch
.
randn
((
m
,
k
),
dtype
=
dtype
,
device
=
device
)
b
=
torch
.
randn
((
n
,
k
),
dtype
=
dtype
,
device
=
device
)
/
k
...
...
tests/kernels/quantization/test_fp8_quant.py
View file @
bb4337b3
...
...
@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant
,
)
from
tests.kernels.utils
import
opcheck
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
...
@@ -51,7 +51,7 @@ def opcheck_fp8_quant(
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
(
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
...
...
@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant(
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant(
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
...
...
tests/kernels/quantization/test_fp8_quant_group.py
View file @
bb4337b3
...
...
@@ -7,7 +7,7 @@ import torch
from
vllm.model_executor.layers.quantization.input_quant_fp8
import
QuantFP8
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
@
pytest
.
mark
.
parametrize
(
...
...
@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality(
Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations.
"""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
randn
((
batch_size
,
hidden_dim
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)
*
8
expected_num_groups
=
(
hidden_dim
+
group_size
-
1
)
//
group_size
...
...
@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality(
@
pytest
.
mark
.
parametrize
(
"use_ue8m0"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_quantfp8_group_multidimensional
(
seed
:
int
,
use_ue8m0
:
bool
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
group_size
=
64
...
...
@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
42
])
@
torch
.
inference_mode
()
def
test_quantfp8_group_edge_cases
(
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
batch_size
=
16
group_size
=
64
...
...
tests/kernels/quantization/test_gguf.py
View file @
bb4337b3
...
...
@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download
import
vllm._custom_ops
as
ops
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
GGUF_SAMPLE
=
snapshot_download
(
"Isotr0py/test-gguf-sample"
)
GGUF_SAMPLE_MOE
=
snapshot_download
(
"SzymonOzog/test-gguf-moe-sample"
)
...
...
@@ -91,7 +91,7 @@ def test_dequantize(
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
QUANT_TYPES
)
@
torch
.
inference_mode
()
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -134,7 +134,7 @@ def test_mmq(
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
,
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -169,7 +169,7 @@ def test_moe(
quant_type
:
GGMLQuantizationType
,
top_k
:
int
,
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
H
,
E
=
1024
,
256
x
=
torch
.
rand
((
num_tokens
,
H
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
tests/kernels/quantization/test_int8_quant.py
View file @
bb4337b3
...
...
@@ -7,7 +7,7 @@ import torch
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
...
@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant(
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
...
@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant(
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant(
scale
:
float
,
azp
:
int
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
...
tests/kernels/quantization/test_mxfp4_qutlass.py
View file @
bb4337b3
...
...
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from
vllm._custom_ops
import
fusedQuantizeMx
,
matmul_mxf4_bf16_tn
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
...
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_nvfp4_quant.py
View file @
bb4337b3
...
...
@@ -6,6 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed
:
int
,
device
:
str
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
m
,
n
=
shape
...
...
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@
torch
.
inference_mode
()
def
test_quantize_to_fp4_padded
(
pad_shape
:
tuple
[
int
,
int
])
->
None
:
dtype
=
torch
.
float16
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
torch
.
set_default_device
(
"cuda:0"
)
m
,
n
=
pad_shape
...
...
tests/kernels/quantization/test_nvfp4_qutlass.py
View file @
bb4337b3
...
...
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from
vllm._custom_ops
import
fusedQuantizeNv
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
...
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_nvfp4_scaled_mm.py
View file @
bb4337b3
...
...
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed
:
int
,
device
:
str
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
packed_k
=
shape
k
=
packed_k
*
2
block_size
=
16
...
...
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
View file @
bb4337b3
...
...
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from
vllm._custom_ops
import
scaled_fp4_quant
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant(
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
int
],
)
->
None
:
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
"cuda:0"
torch
.
set_default_device
(
device
)
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
bb4337b3
...
...
@@ -11,6 +11,7 @@ import pytest
import
torch
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
device
=
"cuda"
...
...
@@ -85,7 +86,7 @@ def test_scaled_mm(
):
is_floating_point_type
=
lambda
t
:
torch
.
tensor
([
1
,
1
],
dtype
=
t
).
is_floating_point
()
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
...
...
tests/kernels/test_apply_repetition_penalties.py
View file @
bb4337b3
...
...
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch
,
)
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_SEQS
=
[
1
,
2
,
3
,
4
,
8
,
13
,
17
,
32
,
37
,
256
,
1023
,
1024
,
1025
]
# [stress, stress, stress, Qwen, llama 4]
...
...
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op
against a reference implementation.
"""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
...
...
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype
=
torch
.
float32
seed
=
0
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment