Unverified commit bb4337b3, authored by wangxiyuan, committed by GitHub
Browse files

[Platform] Deprecate seed_everything (#31659)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent 367856de
...@@ -9,7 +9,7 @@ import torch ...@@ -9,7 +9,7 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
IS_NEOX_STYLE = [True, False] IS_NEOX_STYLE = [True, False]
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
...@@ -79,7 +79,7 @@ def test_rotary_embedding( ...@@ -79,7 +79,7 @@ def test_rotary_embedding(
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
......
...@@ -12,7 +12,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( ...@@ -12,7 +12,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_fn,
causal_conv1d_update, causal_conv1d_update,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
def causal_conv1d_ref( def causal_conv1d_ref(
...@@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity ...@@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity
if itype == torch.bfloat16: if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2 rtol, atol = 1e-2, 5e-2
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
batch = 2 batch = 2
x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
x_ref = x.clone() x_ref = x.clone()
...@@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather( ...@@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather(
rtol, atol = 1e-2, 5e-2 rtol, atol = 1e-2, 5e-2
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
padding = 5 if with_padding else 0 padding = 5 if with_padding else 0
padded_batch_size = batch_size + padding padded_batch_size = batch_size + padding
...@@ -278,7 +278,7 @@ def test_causal_conv1d_varlen( ...@@ -278,7 +278,7 @@ def test_causal_conv1d_varlen(
if itype == torch.bfloat16: if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2 rtol, atol = 1e-2, 5e-2
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
seqlens = [] seqlens = []
batch_size = batch batch_size = batch
padding = 3 if with_padding else 0 padding = 3 if with_padding else 0
......
...@@ -12,8 +12,8 @@ from vllm.distributed.parallel_state import ( ...@@ -12,8 +12,8 @@ from vllm.distributed.parallel_state import (
initialize_model_parallel, initialize_model_parallel,
) )
from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel( ...@@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel(
dtype: torch.dtype, dtype: torch.dtype,
device: str, device: str,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( ...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
selective_scan_fn, selective_scan_fn,
selective_state_update, selective_state_update,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
def selective_state_update_ref( def selective_state_update_ref(
...@@ -271,7 +271,7 @@ def test_selective_scan( ...@@ -271,7 +271,7 @@ def test_selective_scan(
rtolw = max(rtolw, rtol) rtolw = max(rtolw, rtol)
atolw = max(atolw, atol) atolw = max(atolw, atol)
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
batch_size = 1 batch_size = 1
dim = 4 dim = 4
dstate = 8 dstate = 8
...@@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): ...@@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
if torch.version.hip: if torch.version.hip:
atol *= 2 atol *= 2
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
batch_size = 1 batch_size = 1
state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
x = torch.randn(batch_size, dim, device=device, dtype=itype) x = torch.randn(batch_size, dim, device=device, dtype=itype)
...@@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len): ...@@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len):
if torch.version.hip: if torch.version.hip:
atol *= 2 atol *= 2
# set seed # set seed
current_platform.seed_everything(0) set_random_seed(0)
batch_size = 4 batch_size = 4
token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
total_tokens = int(token_counts.sum().item()) total_tokens = int(token_counts.sum().item())
...@@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens( ...@@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens(
if torch.version.hip: if torch.version.hip:
atol *= 2 atol *= 2
current_platform.seed_everything(0) set_random_seed(0)
batch_size = 4 batch_size = 4
tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
...@@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted( ...@@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted(
if torch.version.hip: if torch.version.hip:
atol *= 2 atol *= 2
current_platform.seed_everything(0) set_random_seed(0)
batch_size = 4 batch_size = 4
tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
......
...@@ -9,7 +9,7 @@ from einops import rearrange, repeat ...@@ -9,7 +9,7 @@ from einops import rearrange, repeat
from vllm.model_executor.layers.mamba.ops.ssd_combined import ( from vllm.model_executor.layers.mamba.ops.ssd_combined import (
mamba_chunk_scan_combined_varlen, mamba_chunk_scan_combined_varlen,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata
# Added by the IBM Team, 2024 # Added by the IBM Team, 2024
...@@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): ...@@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"): def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"):
current_platform.seed_everything(0) set_random_seed(0)
A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device)) A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device))
dt = F.softplus( dt = F.softplus(
torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4 torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4
......
...@@ -10,7 +10,7 @@ from tqdm import tqdm ...@@ -10,7 +10,7 @@ from tqdm import tqdm
from vllm.config import VllmConfig, set_current_vllm_config from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
from .common import ( from .common import (
Config, Config,
...@@ -40,7 +40,7 @@ def rank_worker( ...@@ -40,7 +40,7 @@ def rank_worker(
config: Config, config: Config,
weights: WeightTensors, weights: WeightTensors,
): ):
current_platform.seed_everything(pgi.rank) set_random_seed(pgi.rank)
# sanity check # sanity check
from vllm import envs from vllm import envs
......
...@@ -9,7 +9,7 @@ from typing import Any ...@@ -9,7 +9,7 @@ from typing import Any
import torch import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
from .common import Config, RankTensors, WeightTensors, make_modular_kernel from .common import Config, RankTensors, WeightTensors, make_modular_kernel
from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
...@@ -82,7 +82,7 @@ def rank_worker( ...@@ -82,7 +82,7 @@ def rank_worker(
config: Config, config: Config,
weights: WeightTensors, weights: WeightTensors,
): ):
current_platform.seed_everything(pgi.rank) set_random_seed(pgi.rank)
# sanity check # sanity check
from vllm import envs from vllm import envs
......
...@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( ...@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import tl from vllm.triton_utils import tl
from vllm.utils.torch_utils import set_random_seed
MNK_FACTORS = [ MNK_FACTORS = [
(1, 128, 128), (1, 128, 128),
...@@ -115,7 +116,7 @@ def test_batched_mm( ...@@ -115,7 +116,7 @@ def test_batched_mm(
): ):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89, """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware.""" and those tests will be skipped on unsupported hardware."""
current_platform.seed_everything(7) set_random_seed(7)
use_fp8_w8a8 = dtype == torch.float8_e4m3fn use_fp8_w8a8 = dtype == torch.float8_e4m3fn
...@@ -252,7 +253,7 @@ def test_fused_moe_batched_experts( ...@@ -252,7 +253,7 @@ def test_fused_moe_batched_experts(
): ):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89, """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware.""" and those tests will be skipped on unsupported hardware."""
current_platform.seed_everything(7) set_random_seed(7)
use_fp8_w8a8 = dtype == torch.float8_e4m3fn use_fp8_w8a8 = dtype == torch.float8_e4m3fn
......
...@@ -8,6 +8,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol ...@@ -8,6 +8,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.is_cpu(): if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True) pytest.skip("skipping CPU-only tests", allow_module_level=True)
...@@ -114,7 +115,7 @@ def test_cpu_fused_moe( ...@@ -114,7 +115,7 @@ def test_cpu_fused_moe(
act: str, act: str,
isa: str, isa: str,
): ):
current_platform.seed_everything(0) set_random_seed(0)
topk_num = max(expert_num // 2, 1) topk_num = max(expert_num // 2, 1)
up_dim = 2 * intermediate_size up_dim = 2 * intermediate_size
......
...@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import ( ...@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_EXPERTS = [40, 64] NUM_EXPERTS = [40, 64]
TOP_KS = [6, 8] TOP_KS = [6, 8]
...@@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph( ...@@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph(
workspace_init, workspace_init,
ep_size: int | None = None, ep_size: int | None = None,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
...@@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph( ...@@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
monkeypatch, monkeypatch,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
dtype = torch.half dtype = torch.half
...@@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8( ...@@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8(
ep_size: int, ep_size: int,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit( mt = MOETensors8Bit.make_moe_tensors_8bit(
m, k, n, e, per_act_token, per_out_channel m, k, n, e, per_act_token, per_out_channel
......
...@@ -22,13 +22,13 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -22,13 +22,13 @@ from vllm.model_executor.layers.fused_moe.config import (
) )
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout, get_mk_alignment_for_contiguous_layout,
is_deep_gemm_e8m0_used, is_deep_gemm_e8m0_used,
is_deep_gemm_supported, is_deep_gemm_supported,
) )
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
...@@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe( ...@@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe(
device = torch.device(f"cuda:{pgi.local_rank}") device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device) init_workspace_manager(device)
current_platform.seed_everything(pgi.rank) set_random_seed(pgi.rank)
w1 = w1.to(device=torch.cuda.current_device()) w1 = w1.to(device=torch.cuda.current_device())
w2 = w2.to(device=torch.cuda.current_device()) w2 = w2.to(device=torch.cuda.current_device())
...@@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe( ...@@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
""" """
m, n, k = mnk m, n, k = mnk
current_platform.seed_everything(7) set_random_seed(7)
if topk > num_experts: if topk > num_experts:
pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
...@@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe( ...@@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe(
assert not is_deep_gemm_e8m0_used() assert not is_deep_gemm_e8m0_used()
m, n, k = mnk m, n, k = mnk
current_platform.seed_everything(7) set_random_seed(7)
if topk > num_experts: if topk > num_experts:
pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
......
...@@ -20,8 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK ...@@ -20,8 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, per_token_group_quant_fp8,
) )
from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_ep from vllm.utils.import_utils import has_deep_ep
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
...@@ -446,7 +446,7 @@ def test_deep_ep_moe( ...@@ -446,7 +446,7 @@ def test_deep_ep_moe(
low_latency_mode = False low_latency_mode = False
use_fp8_dispatch = False use_fp8_dispatch = False
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
...@@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe( ...@@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe(
f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}"
) )
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
......
...@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( ...@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
from vllm.model_executor.models.llama4 import Llama4MoE from vllm.model_executor.models.llama4 import Llama4MoE
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
try: try:
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
...@@ -158,7 +159,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( ...@@ -158,7 +159,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
): ):
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip("Test is only supported for sm >= 100") pytest.skip("Test is only supported for sm >= 100")
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True) td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
...@@ -222,7 +223,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ...@@ -222,7 +223,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
monkeypatch, monkeypatch,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
td = TestData.make_moe_tensors_8bit( td = TestData.make_moe_tensors_8bit(
......
...@@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk ...@@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.torch_utils import set_random_seed
if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
100 100
...@@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph( ...@@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph(
activation: str, activation: str,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
): ):
......
...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( ...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_grouped_topk, fused_grouped_topk,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
@pytest.mark.skipif( @pytest.mark.skipif(
...@@ -52,7 +53,7 @@ def test_grouped_topk( ...@@ -52,7 +53,7 @@ def test_grouped_topk(
) )
get_cached_compilation_config.cache_clear() get_cached_compilation_config.cache_clear()
current_platform.seed_everything(0) set_random_seed(0)
hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda") hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda")
gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda") gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda")
e_score_correction_bias = torch.randn( e_score_correction_bias = torch.randn(
......
...@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config ...@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
from .modular_kernel_tools.common import ( from .modular_kernel_tools.common import (
...@@ -82,7 +82,7 @@ def rank_worker( ...@@ -82,7 +82,7 @@ def rank_worker(
device = torch.device(f"cuda:{pgi.local_rank}") device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device) init_workspace_manager(device)
current_platform.seed_everything(pgi.rank) set_random_seed(pgi.rank)
# sanity check # sanity check
from vllm import envs from vllm import envs
......
...@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( ...@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
) )
from vllm.model_executor.layers.utils import shuffle_weight from vllm.model_executor.layers.utils import shuffle_weight
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
MNK = [ MNK = [
(1, 512, 384), (1, 512, 384),
...@@ -211,7 +212,7 @@ def test_oai_triton_moe( ...@@ -211,7 +212,7 @@ def test_oai_triton_moe(
unfused: bool, unfused: bool,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(0) set_random_seed(0)
( (
w1, w1,
w2, w2,
......
...@@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w ...@@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192] NUM_EXPERTS = [8, 64, 192]
...@@ -234,7 +235,7 @@ def test_fused_moe( ...@@ -234,7 +235,7 @@ def test_fused_moe(
monkeypatch, monkeypatch,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
......
...@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( ...@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_TOKENS = [1, 3, 256, 2256, 4096]
NUM_EXPERTS = [32, 160, 256, 257] NUM_EXPERTS = [32, 160, 256, 257]
TOP_KS = [1, 2, 16, 32] TOP_KS = [1, 2, 16, 32]
BLOCK_SIZES = [32, 128] BLOCK_SIZES = [32, 128]
current_platform.seed_everything(0) set_random_seed(0)
def _group_tokens_by_expert( def _group_tokens_by_expert(
......
...@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( ...@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_unpermute, moe_unpermute,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_EXPERTS = [16, 64, 256] NUM_EXPERTS = [16, 64, 256]
TOP_KS = [2, 6, 8] TOP_KS = [2, 6, 8]
EP_SIZE = [1, 4, 16] EP_SIZE = [1, 4, 16]
current_platform.seed_everything(0) set_random_seed(0)
if current_platform.is_rocm(): if current_platform.is_rocm():
pytest.skip( pytest.skip(
...@@ -226,7 +227,7 @@ def test_moe_permute_unpermute( ...@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert) n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert)
expert_map = expert_map.cuda() expert_map = expert_map.cuda()
start_expert = n_local_expert * ep_rank start_expert = n_local_expert * ep_rank
current_platform.seed_everything(0) set_random_seed(0)
hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype)
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
topk_weights, topk_ids, token_expert_indices = fused_topk( topk_weights, topk_ids, token_expert_indices = fused_topk(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment