[CI/Build] Avoid CUDA initialization (#8534)

6ffa3f31 · Cyrus Leung · GitHub · e3515729 · 6ffa3f31 · 6ffa3f31
Unverified Commit 6ffa3f31 authored Sep 18, 2024 by Cyrus Leung Committed by GitHub Sep 18, 2024
20 changed files
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
-import random
 import time

 import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -16,10 +16,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 class BenchmarkConfig(TypedDict):
@@ -166,7 +166,7 @@ class BenchmarkWorker:

    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
+        seed_everything(seed)
        self.seed = seed

    def benchmark(
@@ -180,7 +180,7 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
    ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
+        seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
                                         use_int8_w8a16=use_int8_w8a16,
                                         use_fp8_w8a8=use_fp8_w8a8)

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -6,7 +6,7 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+                        create_kv_caches_with_random, seed_everything)

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -28,10 +28,7 @@ def main(
    device: str = "cuda",
    kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    scale = float(1.0 / (head_size**0.5))
    query = torch.empty(num_seqs,

--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
-import random
 import time

 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -17,10 +17,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device("cuda")

    x = torch.randn(num_tokens, hidden_size, dtype=dtype)

--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,7 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                         get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 def benchmark_rope_kernels_multi_lora(
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
    max_position: int = 8192,
    base: int = 10000,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size

--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                   NewGELU, QuickGELU,
                                                   SiluAndMul)
+from vllm.utils import seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -34,9 +35,7 @@ def test_act_and_mul(
    seed: int,
    device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu":
@@ -77,9 +76,7 @@ def test_activation(
    seed: int,
    device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)
    layer = activation[0]()

--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -6,7 +6,7 @@ import torch

 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -139,10 +139,8 @@ def test_paged_attention(
 ) -> None:
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+
+    seed_everything(seed)
    torch.set_default_device(device)
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
@@ -354,10 +352,7 @@ def test_paged_attention_rocm(
    seed: int,
    device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
@@ -506,10 +501,7 @@ def test_multi_query_kv_attention(
    seed: int,
    device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch):
    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)

    # Unsupported CUDA arch
-    with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
+    with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
        backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
        assert backend.name != STR_FLASH_ATTN_VAL


--- a/tests/kernels/test_awq_triton.py
+++ b/tests/kernels/test_awq_triton.py
@@ -7,6 +7,7 @@ import torch

 from vllm.model_executor.layers.quantization.awq_triton import (
    AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
+from vllm.utils import seed_everything

 device = "cuda"

@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
    zeros_cols = qweight_cols
    zeros_dtype = torch.int32

-    torch.manual_seed(0)
+    seed_everything(0)

    qweight = torch.randint(0,
                            torch.iinfo(torch.int32).max,
@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
    qzeros_rows = scales_rows
    qzeros_cols = qweight_cols

-    torch.manual_seed(0)
+    seed_everything(0)

    input = torch.rand((input_rows, input_cols),
                       dtype=input_dtype,

--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -7,7 +7,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.attention.ops.blocksparse_attention.interface import (
    LocalStridedBlockSparseAttn)
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -172,10 +172,7 @@ def test_paged_attention(
    blocksparse_block_size: int,
    blocksparse_head_sliding_step: int,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill(
    seed: int,
    device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use

--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -6,6 +6,7 @@ import torch

 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
+from vllm.utils import seed_everything

 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -55,10 +56,7 @@ def test_copy_blocks(
 ) -> None:
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    # Generate random block mappings where each source block is mapped to two
    # destination blocks.
@@ -134,10 +132,7 @@ def test_reshape_and_cache(
 ) -> None:
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    # Create a random slot mapping.
    num_slots = block_size * num_blocks
@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash(
    device: str,
    kv_cache_dtype: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)

    # Create a random slot mapping.
@@ -345,10 +338,8 @@ def test_swap_blocks(
        pytest.skip()
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+
+    seed_everything(seed)

    src_device = device if direction[0] == "cuda" else 'cpu'
    dst_device = device if direction[1] == "cuda" else 'cpu'
@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion(
    seed: int,
    device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    low = -224.0
    high = 224.0

--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@@ -7,6 +7,7 @@ from einops import rearrange

 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
    causal_conv1d_fn, causal_conv1d_update)
+from vllm.utils import seed_everything


 def causal_conv1d_ref(
@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
-    torch.random.manual_seed(0)
+    seed_everything(0)
    if not channel_last:
        x = torch.randn(batch,
                        4096 + dim + 64,
@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
-    torch.random.manual_seed(0)
+    seed_everything(0)
    batch = 2
    x = torch.randn(batch, dim, device=device, dtype=itype)
    conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)

--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -15,9 +15,6 @@ CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]

-capability = current_platform.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-

 def to_fp8(tensor: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
@@ -119,7 +116,7 @@ def cutlass_int8_gemm_helper(m: int,
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(capability < 89,
+@pytest.mark.skipif(not current_platform.has_device_capability(89),
                    reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
                          per_out_ch: bool, use_bias: bool):
@@ -157,7 +154,7 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(capability < 89,
+@pytest.mark.skipif(not current_platform.has_device_capability(89),
                    reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
                                       out_dtype: Type[torch.dtype],
@@ -175,7 +172,7 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(capability < 89,
+@pytest.mark.skipif(not current_platform.has_device_capability(89),
                    reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
                                  use_bias: bool, device: str):
@@ -207,7 +204,7 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(capability < 89,
+@pytest.mark.skipif(not current_platform.has_device_capability(89),
                    reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
                                  use_bias: bool):

--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -4,6 +4,7 @@ import pytest
 import torch

 import vllm.attention.backends.flash_attn  # noqa: F401
+from vllm.utils import seed_everything

 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 256]
@@ -87,7 +88,7 @@ def test_flash_attn_with_paged_kv(
    num_blocks: int,
 ) -> None:
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(kv_lens)
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
@@ -174,7 +175,7 @@ def test_varlen_with_paged_kv(
    num_blocks: int,
 ) -> None:
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]

--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -4,6 +4,8 @@ import flashinfer
 import pytest
 import torch

+from vllm.utils import seed_everything
+
 NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
@@ -82,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
    soft_cap: Optional[float],
 ) -> None:
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(kv_lens)
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
@@ -168,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                          block_size: int,
                                          soft_cap: Optional[float]) -> None:
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
@@ -266,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
        head_size: int, dtype: torch.dtype, block_size: int,
        soft_cap: Optional[float]) -> None:
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
@@ -379,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
 ) -> None:
    # test doesn't work for num_heads = (16,16)
    torch.set_default_device("cuda")
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)
    num_seqs = len(kv_lens)
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]

--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
@@ -5,6 +5,7 @@ import vllm._custom_ops as ops
 from tests.kernels.quant_utils import (FP8_DTYPE,
                                       ref_dynamic_per_tensor_fp8_quant,
                                       ref_dynamic_per_token_quant)
+from vllm.utils import seed_everything

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192,
@@ -24,8 +25,7 @@ SEEDS = [0]
 def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                                     dtype: torch.dtype, scale_ub: bool,
                                     seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
                   device="cuda") + 1e-6  # avoid nans
@@ -49,8 +49,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
 @torch.inference_mode()
 def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
                                      dtype: torch.dtype, seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")

@@ -67,8 +66,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
 @torch.inference_mode()
 @pytest.mark.parametrize("seed", SEEDS)
 def test_fp8_quant_large(seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
    hidden_size = 1152  # Smallest hidden_size to reproduce the error

--- a/tests/kernels/test_gguf.py
+++ b/tests/kernels/test_gguf.py
@@ -7,6 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
 from huggingface_hub import snapshot_download

 import vllm._custom_ops as ops
+from vllm.utils import seed_everything

 GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")

@@ -74,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
 @torch.inference_mode()
 def test_mmvq(hidden_size: int, dtype: torch.dtype,
              quant_type: GGMLQuantizationType):
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)

    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
@@ -110,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
 @torch.inference_mode()
 def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
             quant_type: GGMLQuantizationType):
-    torch.cuda.manual_seed_all(0)
+    seed_everything(0)

    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")

--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -4,6 +4,7 @@ import torch
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
+from vllm.utils import seed_everything

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
@@ -44,8 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
 @torch.inference_mode()
 def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                   dtype: torch.dtype, seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

@@ -68,8 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
 @torch.inference_mode()
 def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
                                       dtype: torch.dtype, seed: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
@@ -113,8 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
 def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                  dtype: torch.dtype, seed: int,
                                  scale: float) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -140,8 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
 def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
                                      dtype: torch.dtype, seed: int,
                                      scale: float, azp: int) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype,

--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -3,6 +3,7 @@ import torch

 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.utils import seed_everything

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
@@ -30,9 +31,7 @@ def test_rms_norm(
    seed: int,
    device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)

--- a/tests/kernels/test_machete_gemm.py
+++ b/tests/kernels/test_machete_gemm.py
@@ -48,7 +48,7 @@ WTYPE_ZEROPOINTS = [
 #  `is_quant_method_supported` conflates kernels with quantization methods,
 #  an assumption which is breaking down as quantization methods can have
 #  multiple kernels and some kernels support multiple quantization methods.
-IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
+IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)


 def rand_data(shape, dtype=torch.float16):