Unverified Commit bb4337b3 authored by wangxiyuan, committed by GitHub
Browse files

[Platform] Deprecate seed_everything (#31659)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent 367856de
......@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -42,7 +43,7 @@ MNK_FACTORS = [
def test_cutlass_fp4_moe_no_graph(
m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
):
current_platform.seed_everything(7)
set_random_seed(7)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
......
......@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
......@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
set_random_seed(7)
with set_current_vllm_config(vllm_config):
dtype = torch.half
......
......@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test
......@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
dtype: torch.dtype,
workspace_init,
):
current_platform.seed_everything(7)
set_random_seed(7)
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
......@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
current_platform.seed_everything(7)
set_random_seed(7)
m, n, k = mnk
world_size, dp_size = world_dp_size
device = "cuda"
......@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
block_shape: list[int] | None,
use_internode: bool,
):
current_platform.seed_everything(7)
set_random_seed(7)
m, n, k = mnk
world_size, dp_size = world_dp_size
......@@ -888,7 +888,7 @@ def _pplx_test_loop(
new_vllm_config.parallel_config.enable_expert_parallel = True
_set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
current_platform.seed_everything(7)
set_random_seed(7)
combos = itertools.product(
PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
)
......@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
set_random_seed(7)
world_size, dp_size = world_dp_size
parallel_launch(
world_size * dp_size,
......@@ -1005,7 +1005,7 @@ def test_pplx_moe(
use_internode: bool,
use_shared_experts: bool,
):
current_platform.seed_everything(7)
set_random_seed(7)
world_size, dp_size = world_dp_size
parallel_launch(
world_size,
......
......@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import set_random_seed
if current_platform.is_fp8_fnuz():
pytest.skip(
......@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
@torch.inference_mode()
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
group_size = 128
current_platform.seed_everything(42)
set_random_seed(42)
tokens_per_expert = torch.randint(
low=0,
......
......@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from vllm.utils.torch_utils import set_random_seed
FLOAT8_DTYPE = torch.float8_e4m3fn
GROUP_SIZE = 128
......@@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
reason="ROCm does not support DeepGemm.",
)
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
current_platform.seed_everything(42)
set_random_seed(42)
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
......
......@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton,
awq_gemm_triton,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
......@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols
zeros_dtype = torch.int32
current_platform.seed_everything(0)
set_random_seed(0)
qweight = torch.randint(
0,
......@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows
qzeros_cols = qweight_cols
current_platform.seed_everything(0)
set_random_seed(0)
input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
qweight = torch.randint(
......
......@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
IS_SUPPORTED_BY_GPU = (
current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
......@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@pytest.mark.parametrize("random_zero", [True, False])
def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
num_experts, N, K = shape
current_platform.seed_everything(42)
set_random_seed(42)
setup = make_moe_test_setup(
num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
)
......@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason="W4A8 Grouped GEMM is not supported on this GPU type.",
)
def test_cutlass_w4a8_moe_mm_cuda_graph():
current_platform.seed_everything(42)
set_random_seed(42)
# Fixed config for CUDA graph test (single parameter point).
num_experts = 8
K = 512
......
......@@ -12,6 +12,7 @@ from nvfp4_utils import (
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if backend == "trtllm" and dtype == torch.float16:
pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, packed_k = shape
k = packed_k * 2
block_size = 16
......
......@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device: str,
autotune: bool,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k
......
......@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant,
)
from tests.kernels.utils import opcheck
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
......@@ -51,7 +51,7 @@ def opcheck_fp8_quant(
def test_dynamic_per_token_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = (
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
......@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant(
def test_dynamic_per_tensor_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
......@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error
......
......@@ -7,7 +7,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
@pytest.mark.parametrize(
......@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality(
Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations.
"""
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
expected_num_groups = (hidden_dim + group_size - 1) // group_size
......@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
group_size = 64
......@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42])
@torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
batch_size = 16
group_size = 64
......
......@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download
import vllm._custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
......@@ -91,7 +91,7 @@ def test_dequantize(
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
current_platform.seed_everything(0)
set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
......@@ -134,7 +134,7 @@ def test_mmq(
dtype: torch.dtype,
quant_type: GGMLQuantizationType,
):
current_platform.seed_everything(0)
set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
......@@ -169,7 +169,7 @@ def test_moe(
quant_type: GGMLQuantizationType,
top_k: int,
):
current_platform.seed_everything(0)
set_random_seed(0)
H, E = 1024, 256
x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
......
......@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
......@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def test_dynamic_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
......@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant(
def test_dynamic_scaled_int8_azp_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant(
def test_static_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
......@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant(
scale: float,
azp: int,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......
......@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
......@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)
......
......@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
m, n = shape
......@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
dtype = torch.float16
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device("cuda:0")
m, n = pad_shape
......
......@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
......@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)
......
......@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, packed_k = shape
k = packed_k * 2
block_size = 16
......
......@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm._custom_ops import scaled_fp4_quant
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant(
dtype: torch.dtype,
shape: tuple[int, int],
) -> None:
current_platform.seed_everything(42)
set_random_seed(42)
device = "cuda:0"
torch.set_default_device(device)
......
......@@ -11,6 +11,7 @@ import pytest
import torch
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
......@@ -85,7 +86,7 @@ def test_scaled_mm(
):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
current_platform.seed_everything(0)
set_random_seed(0)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
......
......@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
# [stress, stress, stress, Qwen, llama 4]
......@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op
against a reference implementation.
"""
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device("cuda:0")
# Create test data
......@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device("cuda:0")
# Create test data
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment