Unverified commit bb4337b3, authored by wangxiyuan, committed by GitHub
Browse files

[Platform] Deprecate seed_everything (#31659)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent 367856de
...@@ -154,3 +154,4 @@ The interface for the model/module may change during vLLM's development. If you ...@@ -154,3 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
!!! warning "Deprecations" !!! warning "Deprecations"
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add a new attention backend to `AttentionBackendEnum` instead. - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add a new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It will be removed in v0.14.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
...@@ -26,6 +26,7 @@ from vllm.distributed.parallel_state import ( ...@@ -26,6 +26,7 @@ from vllm.distributed.parallel_state import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...models.registry import HF_EXAMPLE_MODELS from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import ( from ...utils import (
...@@ -301,7 +302,7 @@ def async_tp_pass_on_test_model( ...@@ -301,7 +302,7 @@ def async_tp_pass_on_test_model(
dtype: torch.dtype, dtype: torch.dtype,
dynamic: bool, dynamic: bool,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ...@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import has_module_attribute, multi_gpu_test from ...utils import has_module_attribute, multi_gpu_test
from ..backend import TestBackend from ..backend import TestBackend
...@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model( ...@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
enable_rms_norm_custom_op, enable_rms_norm_custom_op,
enable_quant_fp8_custom_op, enable_quant_fp8_custom_op,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape ...@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from ..backend import TestBackend from ..backend import TestBackend
...@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model( ...@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
fuse_norm_quant: bool, fuse_norm_quant: bool,
dynamic: bool, dynamic: bool,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -8,6 +8,7 @@ import torch ...@@ -8,6 +8,7 @@ import torch
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_HEADS = [(4, 4), (8, 2)] NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256] HEAD_SIZES = [128, 256]
...@@ -104,7 +105,7 @@ def test_varlen_with_paged_kv( ...@@ -104,7 +105,7 @@ def test_varlen_with_paged_kv(
if not is_flash_attn_varlen_func_available(): if not is_flash_attn_varlen_func_available():
pytest.skip("flash_attn_varlen_func required to run this test.") pytest.skip("flash_attn_varlen_func required to run this test.")
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
......
...@@ -13,6 +13,7 @@ from vllm.attention.layer import Attention ...@@ -13,6 +13,7 @@ from vllm.attention.layer import Attention
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
...@@ -150,7 +151,7 @@ def test_paged_attention( ...@@ -150,7 +151,7 @@ def test_paged_attention(
global PARTITION_SIZE global PARTITION_SIZE
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads num_query_heads, num_kv_heads = num_heads
......
...@@ -9,6 +9,7 @@ import torch ...@@ -9,6 +9,7 @@ import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")] COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
...@@ -64,7 +65,7 @@ def test_reshape_and_cache( ...@@ -64,7 +65,7 @@ def test_reshape_and_cache(
) -> None: ) -> None:
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
torch.cuda.set_device(device) torch.cuda.set_device(device)
# Create a random slot mapping. # Create a random slot mapping.
...@@ -185,7 +186,7 @@ def test_reshape_and_cache_flash( ...@@ -185,7 +186,7 @@ def test_reshape_and_cache_flash(
kv_cache_layout: str, kv_cache_layout: str,
implementation: str, implementation: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
torch.cuda.set_device(device) torch.cuda.set_device(device)
assert implementation in ["cuda", "triton"] assert implementation in ["cuda", "triton"]
...@@ -355,7 +356,7 @@ def test_swap_blocks( ...@@ -355,7 +356,7 @@ def test_swap_blocks(
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
current_platform.seed_everything(seed) set_random_seed(seed)
src_device = device if direction[0] == "cuda" else "cpu" src_device = device if direction[0] == "cuda" else "cpu"
dst_device = device if direction[1] == "cuda" else "cpu" dst_device = device if direction[1] == "cuda" else "cpu"
...@@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion( ...@@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
low = -224.0 low = -224.0
high = 224.0 high = 224.0
...@@ -507,7 +508,7 @@ def test_concat_and_cache_mla( ...@@ -507,7 +508,7 @@ def test_concat_and_cache_mla(
device: str, device: str,
kv_cache_dtype: str, kv_cache_dtype: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla( ...@@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla(
if dtype.itemsize != 2: if dtype.itemsize != 2:
pytest.skip("ds_mla only supports 16-bit input") pytest.skip("ds_mla only supports 16-bit input")
kv_cache_dtype = "fp8_ds_mla" kv_cache_dtype = "fp8_ds_mla"
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -695,7 +696,7 @@ def test_swap_blocks_mla( ...@@ -695,7 +696,7 @@ def test_swap_blocks_mla(
device: str, device: str,
kv_cache_dtype: str, kv_cache_dtype: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu( ...@@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu(
) -> None: ) -> None:
device = "cpu" device = "cpu"
kv_cache_dtype = "auto" kv_cache_dtype = "auto"
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
total_slots = num_blocks * block_size total_slots = num_blocks * block_size
......
...@@ -6,6 +6,7 @@ import pytest ...@@ -6,6 +6,7 @@ import pytest
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states
try: try:
...@@ -39,7 +40,7 @@ def test_merge_kernel( ...@@ -39,7 +40,7 @@ def test_merge_kernel(
dtype: torch.dtype, dtype: torch.dtype,
): ):
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
assert num_query_heads % num_kv_heads == 0 assert num_query_heads % num_kv_heads == 0
...@@ -103,7 +104,7 @@ def test_cascade( ...@@ -103,7 +104,7 @@ def test_cascade(
f'to: "{fa_version_unsupported_reason(fa_version)}"' f'to: "{fa_version_unsupported_reason(fa_version)}"'
) )
current_platform.seed_everything(0) set_random_seed(0)
window_size = (-1, -1) window_size = (-1, -1)
scale = head_size**-0.5 scale = head_size**-0.5
......
...@@ -8,6 +8,7 @@ import pytest ...@@ -8,6 +8,7 @@ import pytest
import torch import torch
from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
if not current_platform.is_cpu(): if not current_platform.is_cpu():
...@@ -190,7 +191,7 @@ def varlen_with_paged_kv( ...@@ -190,7 +191,7 @@ def varlen_with_paged_kv(
use_sink: bool, use_sink: bool,
isa: str, isa: str,
) -> None: ) -> None:
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
......
...@@ -6,6 +6,7 @@ import pytest ...@@ -6,6 +6,7 @@ import pytest
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
try: try:
from vllm.vllm_flash_attn import ( from vllm.vllm_flash_attn import (
...@@ -129,7 +130,7 @@ def test_varlen_with_paged_kv( ...@@ -129,7 +130,7 @@ def test_varlen_with_paged_kv(
"Flash attention with quantized inputs is only " "Flash attention with quantized inputs is only "
"supported on version 3 with bfloat16 base type" "supported on version 3 with bfloat16 base type"
) )
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import pytest import pytest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
try: try:
import flashinfer import flashinfer
...@@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv( ...@@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv(
sliding_window: int | None, sliding_window: int | None,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(kv_lens) num_seqs = len(kv_lens)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
...@@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv( ...@@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv(
sliding_window: int | None, sliding_window: int | None,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
...@@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( ...@@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
) -> None: ) -> None:
pytest.skip("TODO: fix the accuracy issue") pytest.skip("TODO: fix the accuracy issue")
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
...@@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ...@@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
) -> None: ) -> None:
# test doesn't work for num_heads = (16,16) # test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(kv_lens) num_seqs = len(kv_lens)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
......
...@@ -10,6 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import ( ...@@ -10,6 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
if not current_platform.is_device_capability_family(100): if not current_platform.is_device_capability_family(100):
pytest.skip( pytest.skip(
...@@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline( ...@@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
has_sinks: bool, has_sinks: bool,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(42) set_random_seed(42)
q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
q_quant_dtype = q_quant_dtype or dtype q_quant_dtype = q_quant_dtype or dtype
...@@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( ...@@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
has_sinks: bool, has_sinks: bool,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(42) set_random_seed(42)
q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
q_quant_dtype = q_quant_dtype or dtype q_quant_dtype = q_quant_dtype or dtype
......
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
import torch import torch
from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
NUM_HEADS = [4, 8] NUM_HEADS = [4, 8]
HEAD_SIZES = [64] HEAD_SIZES = [64]
...@@ -124,7 +124,7 @@ def test_linear_decode_forward_triton( ...@@ -124,7 +124,7 @@ def test_linear_decode_forward_triton(
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.manual_seed(42) torch.manual_seed(42)
torch.cuda.manual_seed_all(42) torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42) set_random_seed(42)
base = 0.01 base = 0.01
q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
...@@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding( ...@@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding(
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.manual_seed(42) torch.manual_seed(42)
torch.cuda.manual_seed_all(42) torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42) set_random_seed(42)
batch_size = 4 batch_size = 4
base = 0.01 base = 0.01
...@@ -231,7 +231,7 @@ def test_lightning_attention_reference( ...@@ -231,7 +231,7 @@ def test_lightning_attention_reference(
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.manual_seed(42) torch.manual_seed(42)
torch.cuda.manual_seed_all(42) torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42) set_random_seed(42)
base = 0.01 base = 0.01
q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
......
...@@ -19,6 +19,7 @@ from vllm.platforms import current_platform ...@@ -19,6 +19,7 @@ from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils.torch_utils import set_random_seed
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -123,7 +124,7 @@ def test_mha_attn_forward( ...@@ -123,7 +124,7 @@ def test_mha_attn_forward(
dtype: torch.dtype, dtype: torch.dtype,
device: str, device: str,
): ):
current_platform.seed_everything(0) set_random_seed(0)
torch.set_default_device(device) torch.set_default_device(device)
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
...@@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward( ...@@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward(
dtype: torch.dtype, dtype: torch.dtype,
device: str, device: str,
): ):
current_platform.seed_everything(0) set_random_seed(0)
torch.set_default_device(device) torch.set_default_device(device)
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
......
...@@ -13,7 +13,7 @@ import torch.nn.functional as F ...@@ -13,7 +13,7 @@ import torch.nn.functional as F
from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
NUM_HEADS = [64] NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 64] NUM_QUERIES_PER_KV = [1, 64]
...@@ -125,7 +125,7 @@ def test_contexted_kv_attention( ...@@ -125,7 +125,7 @@ def test_contexted_kv_attention(
): ):
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
current_platform.seed_everything(0) set_random_seed(0)
torch.set_default_device(device) torch.set_default_device(device)
# Need this, otherwise when we capture the graph the process # Need this, otherwise when we capture the graph the process
...@@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi( ...@@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi(
): ):
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
current_platform.seed_everything(0) set_random_seed(0)
torch.set_default_device(device) torch.set_default_device(device)
# Need this, otherwise when we capture the graph the process # Need this, otherwise when we capture the graph the process
......
...@@ -8,6 +8,7 @@ import torch ...@@ -8,6 +8,7 @@ import torch
from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2 from vllm.utils.math_utils import next_power_of_2
from vllm.utils.torch_utils import set_random_seed
NUM_HEADS = [(4, 4), (8, 2)] NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256] HEAD_SIZES = [128, 256]
...@@ -113,7 +114,7 @@ def test_triton_unified_attn( ...@@ -113,7 +114,7 @@ def test_triton_unified_attn(
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
current_platform.seed_everything(0) set_random_seed(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
......
...@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import ( ...@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import (
SiluAndMul, SiluAndMul,
SwigluOAIAndMul, SwigluOAIAndMul,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
...@@ -52,7 +52,7 @@ def test_act_and_mul( ...@@ -52,7 +52,7 @@ def test_act_and_mul(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype) x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu_and_mul": if activation == "silu_and_mul":
...@@ -129,7 +129,7 @@ def test_activation( ...@@ -129,7 +129,7 @@ def test_activation(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype) x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation[0]() layer = activation[0]()
......
...@@ -8,6 +8,7 @@ from tests.kernels.utils import opcheck ...@@ -8,6 +8,7 @@ from tests.kernels.utils import opcheck
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float16] DTYPES = [torch.bfloat16, torch.float16]
IS_NEOX = [True, False] IS_NEOX = [True, False]
...@@ -64,7 +65,7 @@ def test_fused_qk_norm_rope_matches_reference( ...@@ -64,7 +65,7 @@ def test_fused_qk_norm_rope_matches_reference(
rotary_ratio: float, rotary_ratio: float,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) set_random_seed(seed)
num_heads, num_kv_heads, head_dim = 16, 4, 128 num_heads, num_kv_heads, head_dim = 16, 4, 128
num_tokens = 4 num_tokens = 4
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.quant_utils import FP8_DTYPE
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
...@@ -34,7 +34,7 @@ def test_rms_norm( ...@@ -34,7 +34,7 @@ def test_rms_norm(
device: str, device: str,
strided_input: bool, strided_input: bool,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
layer = RMSNorm(hidden_size).to(dtype=dtype) layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1) layer.weight.data.normal_(mean=1.0, std=0.1)
...@@ -88,7 +88,7 @@ def test_fused_rms_norm_quant( ...@@ -88,7 +88,7 @@ def test_fused_rms_norm_quant(
device: str, device: str,
strided_input: bool, strided_input: bool,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
......
...@@ -10,6 +10,7 @@ from transformers import __version__ as TRANSFORMERS_VERSION ...@@ -10,6 +10,7 @@ from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config from vllm.transformers_utils.config import get_config
from vllm.utils.torch_utils import set_random_seed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
...@@ -24,7 +25,7 @@ def generate_test_data( ...@@ -24,7 +25,7 @@ def generate_test_data(
device: torch.device, device: torch.device,
): ):
"""Generate test data for given configuration.""" """Generate test data for given configuration."""
current_platform.seed_everything(42) set_random_seed(42)
# Create 2D positions (3, num_tokens) for multimodal case # Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint( positions = torch.randint(
0, max_position_embeddings // 4, (3, num_tokens), device=device 0, max_position_embeddings // 4, (3, num_tokens), device=device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment