Unverified Commit bb4337b3 authored by wangxiyuan, committed by GitHub
Browse files

[Platform] Deprecate seed_everything (#31659)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent 367856de
...@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config ...@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -42,7 +43,7 @@ MNK_FACTORS = [ ...@@ -42,7 +43,7 @@ MNK_FACTORS = [
def test_cutlass_fp4_moe_no_graph( def test_cutlass_fp4_moe_no_graph(
m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
): ):
......
...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk ...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
...@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx( ...@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
dtype = torch.half dtype = torch.half
......
...@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK ...@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate, TopKWeightAndReduceDelegate,
) )
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
...@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts( ...@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
dtype: torch.dtype, dtype: torch.dtype,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
...@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow( ...@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
if per_act_token_quant and block_shape is not None: if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination") pytest.skip("Skip illegal quantization combination")
current_platform.seed_everything(7) set_random_seed(7)
m, n, k = mnk m, n, k = mnk
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
device = "cuda" device = "cuda"
...@@ -809,7 +809,7 @@ def test_pplx_moe_slow( ...@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
block_shape: list[int] | None, block_shape: list[int] | None,
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
m, n, k = mnk m, n, k = mnk
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
...@@ -888,7 +888,7 @@ def _pplx_test_loop( ...@@ -888,7 +888,7 @@ def _pplx_test_loop(
new_vllm_config.parallel_config.enable_expert_parallel = True new_vllm_config.parallel_config.enable_expert_parallel = True
_set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank) _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
current_platform.seed_everything(7) set_random_seed(7)
combos = itertools.product( combos = itertools.product(
PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]] PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
) )
...@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize( ...@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
parallel_launch( parallel_launch(
world_size * dp_size, world_size * dp_size,
...@@ -1005,7 +1005,7 @@ def test_pplx_moe( ...@@ -1005,7 +1005,7 @@ def test_pplx_moe(
use_internode: bool, use_internode: bool,
use_shared_experts: bool, use_shared_experts: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
parallel_launch( parallel_launch(
world_size, world_size,
......
...@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( ...@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
from vllm.utils.math_utils import cdiv, round_up from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import set_random_seed
if current_platform.is_fp8_fnuz(): if current_platform.is_fp8_fnuz():
pytest.skip( pytest.skip(
...@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert): ...@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
@torch.inference_mode() @torch.inference_mode()
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
group_size = 128 group_size = 128
current_platform.seed_everything(42) set_random_seed(42)
tokens_per_expert = torch.randint( tokens_per_expert = torch.randint(
low=0, low=0,
......
...@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ...@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from vllm.utils.torch_utils import set_random_seed
FLOAT8_DTYPE = torch.float8_e4m3fn FLOAT8_DTYPE = torch.float8_e4m3fn
GROUP_SIZE = 128 GROUP_SIZE = 128
...@@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten ...@@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
reason="ROCm does not support DeepGemm.", reason="ROCm does not support DeepGemm.",
) )
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int): def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
current_platform.seed_everything(42) set_random_seed(42)
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda") input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
......
...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( ...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton, awq_dequantize_triton,
awq_gemm_triton, awq_gemm_triton,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
device = "cuda" device = "cuda"
...@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): ...@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols zeros_cols = qweight_cols
zeros_dtype = torch.int32 zeros_dtype = torch.int32
current_platform.seed_everything(0) set_random_seed(0)
qweight = torch.randint( qweight = torch.randint(
0, 0,
...@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size): ...@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows qzeros_rows = scales_rows
qzeros_cols = qweight_cols qzeros_cols = qweight_cols
current_platform.seed_everything(0) set_random_seed(0)
input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device) input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
qweight = torch.randint( qweight = torch.randint(
......
...@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
IS_SUPPORTED_BY_GPU = ( IS_SUPPORTED_BY_GPU = (
current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
...@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: ...@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@pytest.mark.parametrize("random_zero", [True, False]) @pytest.mark.parametrize("random_zero", [True, False])
def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
num_experts, N, K = shape num_experts, N, K = shape
current_platform.seed_everything(42) set_random_seed(42)
setup = make_moe_test_setup( setup = make_moe_test_setup(
num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
) )
...@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module): ...@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason="W4A8 Grouped GEMM is not supported on this GPU type.", reason="W4A8 Grouped GEMM is not supported on this GPU type.",
) )
def test_cutlass_w4a8_moe_mm_cuda_graph(): def test_cutlass_w4a8_moe_mm_cuda_graph():
current_platform.seed_everything(42) set_random_seed(42)
# Fixed config for CUDA graph test (single parameter point). # Fixed config for CUDA graph test (single parameter point).
num_experts = 8 num_experts = 8
K = 512 K = 512
......
...@@ -12,6 +12,7 @@ from nvfp4_utils import ( ...@@ -12,6 +12,7 @@ from nvfp4_utils import (
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm( ...@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if backend == "trtllm" and dtype == torch.float16: if backend == "trtllm" and dtype == torch.float16:
pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations") pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, packed_k = shape m, n, packed_k = shape
k = packed_k * 2 k = packed_k * 2
block_size = 16 block_size = 16
......
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm( ...@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device: str, device: str,
autotune: bool, autotune: bool,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, k = shape m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device) a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k b = torch.randn((n, k), dtype=dtype, device=device) / k
......
...@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import ( ...@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant, ref_dynamic_per_token_quant,
) )
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
...@@ -51,7 +51,7 @@ def opcheck_fp8_quant( ...@@ -51,7 +51,7 @@ def opcheck_fp8_quant(
def test_dynamic_per_token_fp8_quant( def test_dynamic_per_token_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = ( x = (
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
...@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant( ...@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant(
def test_dynamic_per_tensor_fp8_quant( def test_dynamic_per_tensor_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
...@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant( ...@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode() @torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None: def test_fp8_quant_large(seed: int) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error hidden_size = 1152 # Smallest hidden_size to reproduce the error
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality( ...@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality(
Tests both CUDA and native implementations, column-major scales, Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations. and verifies consistency between implementations.
""" """
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
expected_num_groups = (hidden_dim + group_size - 1) // group_size expected_num_groups = (hidden_dim + group_size - 1) // group_size
...@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality( ...@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("use_ue8m0", [True, False]) @pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
group_size = 64 group_size = 64
...@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: ...@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None: def test_quantfp8_group_edge_cases(seed: int) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
batch_size = 16 batch_size = 16
group_size = 64 group_size = 64
......
...@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download ...@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample") GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
...@@ -91,7 +91,7 @@ def test_dequantize( ...@@ -91,7 +91,7 @@ def test_dequantize(
@pytest.mark.parametrize("quant_type", QUANT_TYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode() @torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
current_platform.seed_everything(0) set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
...@@ -134,7 +134,7 @@ def test_mmq( ...@@ -134,7 +134,7 @@ def test_mmq(
dtype: torch.dtype, dtype: torch.dtype,
quant_type: GGMLQuantizationType, quant_type: GGMLQuantizationType,
): ):
current_platform.seed_everything(0) set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
...@@ -169,7 +169,7 @@ def test_moe( ...@@ -169,7 +169,7 @@ def test_moe(
quant_type: GGMLQuantizationType, quant_type: GGMLQuantizationType,
top_k: int, top_k: int,
): ):
current_platform.seed_everything(0) set_random_seed(0)
H, E = 1024, 256 H, E = 1024, 256
x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant from vllm._custom_ops import scaled_int8_quant
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
...@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): ...@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def test_dynamic_scaled_int8_quant( def test_dynamic_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant( ...@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant(
def test_dynamic_scaled_int8_azp_quant( def test_dynamic_scaled_int8_azp_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
...@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant( ...@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant(
def test_static_scaled_int8_quant( def test_static_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant( ...@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant(
scale: float, scale: float,
azp: int, azp: int,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......
...@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m ...@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available(): if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True) pytest.skip("CUDA required for these tests.", allow_module_level=True)
...@@ -205,7 +206,7 @@ LLAMA_MODELS = { ...@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _seed_each_test(): def _seed_each_test():
current_platform.seed_everything(0) set_random_seed(0)
np.random.seed(0) np.random.seed(0)
torch.random.manual_seed(0) torch.random.manual_seed(0)
......
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -134,7 +135,7 @@ def test_quantize_to_fp4( ...@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
m, n = shape m, n = shape
...@@ -156,7 +157,7 @@ def test_quantize_to_fp4( ...@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode() @torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
dtype = torch.float16 dtype = torch.float16
current_platform.seed_everything(42) set_random_seed(42)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
m, n = pad_shape m, n = pad_shape
......
...@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm ...@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available(): if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True) pytest.skip("CUDA required for these tests.", allow_module_level=True)
...@@ -193,7 +194,7 @@ LLAMA_MODELS = { ...@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _seed_each_test(): def _seed_each_test():
current_platform.seed_everything(0) set_random_seed(0)
np.random.seed(0) np.random.seed(0)
torch.random.manual_seed(0) torch.random.manual_seed(0)
......
...@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt ...@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -59,7 +60,7 @@ def test_nvfp4_gemm( ...@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, packed_k = shape m, n, packed_k = shape
k = packed_k * 2 k = packed_k * 2
block_size = 16 block_size = 16
......
...@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import ( ...@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm._custom_ops import scaled_fp4_quant from vllm._custom_ops import scaled_fp4_quant
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant( ...@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant(
dtype: torch.dtype, dtype: torch.dtype,
shape: tuple[int, int], shape: tuple[int, int],
) -> None: ) -> None:
current_platform.seed_everything(42) set_random_seed(42)
device = "cuda:0" device = "cuda:0"
torch.set_default_device(device) torch.set_default_device(device)
......
...@@ -11,6 +11,7 @@ import pytest ...@@ -11,6 +11,7 @@ import pytest
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
device = "cuda" device = "cuda"
...@@ -85,7 +86,7 @@ def test_scaled_mm( ...@@ -85,7 +86,7 @@ def test_scaled_mm(
): ):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point() is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
current_platform.seed_everything(0) set_random_seed(0)
# NOTE: There are cases, where if the matrix is large enough, an output # NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when # like 65504.4 can be produced, and can easily turn into inf when
......
...@@ -9,6 +9,7 @@ from vllm._custom_ops import ( ...@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch, apply_repetition_penalties_torch,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
# [stress, stress, stress, Qwen, llama 4] # [stress, stress, stress, Qwen, llama 4]
...@@ -38,7 +39,7 @@ def test_apply_repetition_penalties( ...@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op Test the apply_repetition_penalties custom op
against a reference implementation. against a reference implementation.
""" """
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
# Create test data # Create test data
...@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None: ...@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype = torch.float32 dtype = torch.float32
seed = 0 seed = 0
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
# Create test data # Create test data
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment