Unverified commit 6ffa3f31, authored by Cyrus Leung and committed by GitHub
Browse files

[CI/Build] Avoid CUDA initialization (#8534)

parent e3515729
import random
import time import time
import torch import torch
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
@torch.inference_mode() @torch.inference_mode()
...@@ -16,10 +16,7 @@ def main(num_tokens: int, ...@@ -16,10 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False, do_profile: bool = False,
num_warmup_iters: int = 5, num_warmup_iters: int = 5,
num_iters: int = 100) -> None: num_iters: int = 100) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device("cuda") torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype) layer = RMSNorm(hidden_size).to(dtype=dtype)
......
...@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm ...@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
from transformers import AutoConfig from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser, seed_everything
class BenchmarkConfig(TypedDict): class BenchmarkConfig(TypedDict):
...@@ -166,7 +166,7 @@ class BenchmarkWorker: ...@@ -166,7 +166,7 @@ class BenchmarkWorker:
def __init__(self, seed: int) -> None: def __init__(self, seed: int) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(seed) seed_everything(seed)
self.seed = seed self.seed = seed
def benchmark( def benchmark(
...@@ -180,7 +180,7 @@ class BenchmarkWorker: ...@@ -180,7 +180,7 @@ class BenchmarkWorker:
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]: ) -> Tuple[Dict[str, int], float]:
torch.cuda.manual_seed_all(self.seed) seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype, dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16, use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8) use_fp8_w8a8=use_fp8_w8a8)
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random) create_kv_caches_with_random, seed_everything)
NUM_BLOCKS = 1024 NUM_BLOCKS = 1024
PARTITION_SIZE = 512 PARTITION_SIZE = 512
...@@ -28,10 +28,7 @@ def main( ...@@ -28,10 +28,7 @@ def main(
device: str = "cuda", device: str = "cuda",
kv_cache_dtype: Optional[str] = None, kv_cache_dtype: Optional[str] = None,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs, query = torch.empty(num_seqs,
......
import random
import time import time
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
@torch.inference_mode() @torch.inference_mode()
...@@ -17,10 +17,7 @@ def main(num_tokens: int, ...@@ -17,10 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False, do_profile: bool = False,
num_warmup_iters: int = 5, num_warmup_iters: int = 5,
num_iters: int = 100) -> None: num_iters: int = 100) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device("cuda") torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype) x = torch.randn(num_tokens, hidden_size, dtype=dtype)
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope) get_rope)
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser, seed_everything
def benchmark_rope_kernels_multi_lora( def benchmark_rope_kernels_multi_lora(
...@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora( ...@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192, max_position: int = 8192,
base: int = 10000, base: int = 10000,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
......
...@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck ...@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, QuickGELU, NewGELU, QuickGELU,
SiluAndMul) SiluAndMul)
from vllm.utils import seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
...@@ -34,9 +35,7 @@ def test_act_and_mul( ...@@ -34,9 +35,7 @@ def test_act_and_mul(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype) x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu": if activation == "silu":
...@@ -77,9 +76,7 @@ def test_activation( ...@@ -77,9 +76,7 @@ def test_activation(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype) x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation[0]() layer = activation[0]()
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import get_max_shared_memory_bytes, is_hip from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
...@@ -139,10 +139,8 @@ def test_paged_attention( ...@@ -139,10 +139,8 @@ def test_paged_attention(
) -> None: ) -> None:
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
random.seed(seed)
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads num_query_heads, num_kv_heads = num_heads
...@@ -354,10 +352,7 @@ def test_paged_attention_rocm( ...@@ -354,10 +352,7 @@ def test_paged_attention_rocm(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads num_query_heads, num_kv_heads = num_heads
...@@ -506,10 +501,7 @@ def test_multi_query_kv_attention( ...@@ -506,10 +501,7 @@ def test_multi_query_kv_attention(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use # As the xformers library is already tested with its own tests, we can use
......
...@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch): ...@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch):
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]): with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
......
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm.model_executor.layers.quantization.awq_triton import ( from vllm.model_executor.layers.quantization.awq_triton import (
AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
from vllm.utils import seed_everything
device = "cuda" device = "cuda"
...@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): ...@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols zeros_cols = qweight_cols
zeros_dtype = torch.int32 zeros_dtype = torch.int32
torch.manual_seed(0) seed_everything(0)
qweight = torch.randint(0, qweight = torch.randint(0,
torch.iinfo(torch.int32).max, torch.iinfo(torch.int32).max,
...@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size): ...@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows qzeros_rows = scales_rows
qzeros_cols = qweight_cols qzeros_cols = qweight_cols
torch.manual_seed(0) seed_everything(0)
input = torch.rand((input_rows, input_cols), input = torch.rand((input_rows, input_cols),
dtype=input_dtype, dtype=input_dtype,
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import ( from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn) LocalStridedBlockSparseAttn)
from vllm.utils import get_max_shared_memory_bytes, is_hip from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
...@@ -172,10 +172,7 @@ def test_paged_attention( ...@@ -172,10 +172,7 @@ def test_paged_attention(
blocksparse_block_size: int, blocksparse_block_size: int,
blocksparse_head_sliding_step: int, blocksparse_head_sliding_step: int,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads num_query_heads, num_kv_heads = num_heads
...@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill( ...@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use # As the xformers library is already tested with its own tests, we can use
......
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import seed_everything
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
...@@ -55,10 +56,7 @@ def test_copy_blocks( ...@@ -55,10 +56,7 @@ def test_copy_blocks(
) -> None: ) -> None:
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# Generate random block mappings where each source block is mapped to two # Generate random block mappings where each source block is mapped to two
# destination blocks. # destination blocks.
...@@ -134,10 +132,7 @@ def test_reshape_and_cache( ...@@ -134,10 +132,7 @@ def test_reshape_and_cache(
) -> None: ) -> None:
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
...@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash( ...@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash(
device: str, device: str,
kv_cache_dtype: str, kv_cache_dtype: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# Create a random slot mapping. # Create a random slot mapping.
...@@ -345,10 +338,8 @@ def test_swap_blocks( ...@@ -345,10 +338,8 @@ def test_swap_blocks(
pytest.skip() pytest.skip()
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip() pytest.skip()
random.seed(seed)
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
src_device = device if direction[0] == "cuda" else 'cpu' src_device = device if direction[0] == "cuda" else 'cpu'
dst_device = device if direction[1] == "cuda" else 'cpu' dst_device = device if direction[1] == "cuda" else 'cpu'
...@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion( ...@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
low = -224.0 low = -224.0
high = 224.0 high = 224.0
......
...@@ -7,6 +7,7 @@ from einops import rearrange ...@@ -7,6 +7,7 @@ from einops import rearrange
from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_update) causal_conv1d_fn, causal_conv1d_update)
from vllm.utils import seed_everything
def causal_conv1d_ref( def causal_conv1d_ref(
...@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, ...@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
if itype == torch.bfloat16: if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2 rtol, atol = 1e-2, 5e-2
# set seed # set seed
torch.random.manual_seed(0) seed_everything(0)
if not channel_last: if not channel_last:
x = torch.randn(batch, x = torch.randn(batch,
4096 + dim + 64, 4096 + dim + 64,
...@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation, ...@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
if itype == torch.bfloat16: if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2 rtol, atol = 1e-2, 5e-2
# set seed # set seed
torch.random.manual_seed(0) seed_everything(0)
batch = 2 batch = 2
x = torch.randn(batch, dim, device=device, dtype=itype) x = torch.randn(batch, dim, device=device, dtype=itype)
conv_state = torch.randn(batch, dim, width, device=device, dtype=itype) conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)
......
...@@ -15,9 +15,6 @@ CUDA_DEVICES = [ ...@@ -15,9 +15,6 @@ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ]
capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1]
def to_fp8(tensor: torch.Tensor): def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
...@@ -119,7 +116,7 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -119,7 +116,7 @@ def cutlass_int8_gemm_helper(m: int,
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(not current_platform.has_device_capability(89),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
per_out_ch: bool, use_bias: bool): per_out_ch: bool, use_bias: bool):
...@@ -157,7 +154,7 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, ...@@ -157,7 +154,7 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(not current_platform.has_device_capability(89),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
out_dtype: Type[torch.dtype], out_dtype: Type[torch.dtype],
...@@ -175,7 +172,7 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, ...@@ -175,7 +172,7 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(not current_platform.has_device_capability(89),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool, def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
use_bias: bool, device: str): use_bias: bool, device: str):
...@@ -207,7 +204,7 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool, ...@@ -207,7 +204,7 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(not current_platform.has_device_capability(89),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool, def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
use_bias: bool): use_bias: bool):
......
...@@ -4,6 +4,7 @@ import pytest ...@@ -4,6 +4,7 @@ import pytest
import torch import torch
import vllm.attention.backends.flash_attn # noqa: F401 import vllm.attention.backends.flash_attn # noqa: F401
from vllm.utils import seed_everything
NUM_HEADS = [(4, 4), (8, 2), (16, 2)] NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
HEAD_SIZES = [128, 256] HEAD_SIZES = [128, 256]
...@@ -87,7 +88,7 @@ def test_flash_attn_with_paged_kv( ...@@ -87,7 +88,7 @@ def test_flash_attn_with_paged_kv(
num_blocks: int, num_blocks: int,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(kv_lens) num_seqs = len(kv_lens)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
...@@ -174,7 +175,7 @@ def test_varlen_with_paged_kv( ...@@ -174,7 +175,7 @@ def test_varlen_with_paged_kv(
num_blocks: int, num_blocks: int,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
......
...@@ -4,6 +4,8 @@ import flashinfer ...@@ -4,6 +4,8 @@ import flashinfer
import pytest import pytest
import torch import torch
from vllm.utils import seed_everything
NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
HEAD_SIZES = [128, 256] HEAD_SIZES = [128, 256]
BLOCK_SIZES = [16, 32] BLOCK_SIZES = [16, 32]
...@@ -82,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv( ...@@ -82,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
soft_cap: Optional[float], soft_cap: Optional[float],
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(kv_lens) num_seqs = len(kv_lens)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
...@@ -168,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], ...@@ -168,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
block_size: int, block_size: int,
soft_cap: Optional[float]) -> None: soft_cap: Optional[float]) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
...@@ -266,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( ...@@ -266,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
head_size: int, dtype: torch.dtype, block_size: int, head_size: int, dtype: torch.dtype, block_size: int,
soft_cap: Optional[float]) -> None: soft_cap: Optional[float]) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens] query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens] kv_lens = [x[1] for x in seq_lens]
...@@ -379,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ...@@ -379,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
) -> None: ) -> None:
# test doesn't work for num_heads = (16,16) # test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0) seed_everything(0)
num_seqs = len(kv_lens) num_seqs = len(kv_lens)
num_query_heads = num_heads[0] num_query_heads = num_heads[0]
num_kv_heads = num_heads[1] num_kv_heads = num_heads[1]
......
...@@ -5,6 +5,7 @@ import vllm._custom_ops as ops ...@@ -5,6 +5,7 @@ import vllm._custom_ops as ops
from tests.kernels.quant_utils import (FP8_DTYPE, from tests.kernels.quant_utils import (FP8_DTYPE,
ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_tensor_fp8_quant,
ref_dynamic_per_token_quant) ref_dynamic_per_token_quant)
from vllm.utils import seed_everything
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192,
...@@ -24,8 +25,7 @@ SEEDS = [0] ...@@ -24,8 +25,7 @@ SEEDS = [0]
def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, scale_ub: bool, dtype: torch.dtype, scale_ub: bool,
seed: int) -> None: seed: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, x = torch.rand(num_tokens, hidden_size, dtype=dtype,
device="cuda") + 1e-6 # avoid nans device="cuda") + 1e-6 # avoid nans
...@@ -49,8 +49,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, ...@@ -49,8 +49,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
@torch.inference_mode() @torch.inference_mode()
def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int) -> None: dtype: torch.dtype, seed: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
...@@ -67,8 +66,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, ...@@ -67,8 +66,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
@torch.inference_mode() @torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None: def test_fp8_quant_large(seed: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error hidden_size = 1152 # Smallest hidden_size to reproduce the error
......
...@@ -7,6 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize ...@@ -7,6 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.utils import seed_everything
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
...@@ -74,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, ...@@ -74,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
@torch.inference_mode() @torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype, def test_mmvq(hidden_size: int, dtype: torch.dtype,
quant_type: GGMLQuantizationType): quant_type: GGMLQuantizationType):
torch.cuda.manual_seed_all(0) seed_everything(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
...@@ -110,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, ...@@ -110,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
@torch.inference_mode() @torch.inference_mode()
def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
quant_type: GGMLQuantizationType): quant_type: GGMLQuantizationType):
torch.cuda.manual_seed_all(0) seed_everything(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
......
...@@ -4,6 +4,7 @@ import torch ...@@ -4,6 +4,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant from vllm._custom_ops import scaled_int8_quant
from vllm.utils import seed_everything
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
...@@ -44,8 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): ...@@ -44,8 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
@torch.inference_mode() @torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int) -> None: dtype: torch.dtype, seed: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -68,8 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, ...@@ -68,8 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
@torch.inference_mode() @torch.inference_mode()
def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int) -> None: dtype: torch.dtype, seed: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, x = torch.rand(num_tokens, hidden_size, dtype=dtype,
...@@ -113,8 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, ...@@ -113,8 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int, dtype: torch.dtype, seed: int,
scale: float) -> None: scale: float) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -140,8 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, ...@@ -140,8 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int, dtype: torch.dtype, seed: int,
scale: float, azp: int) -> None: scale: float, azp: int) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, x = torch.rand(num_tokens, hidden_size, dtype=dtype,
......
...@@ -3,6 +3,7 @@ import torch ...@@ -3,6 +3,7 @@ import torch
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import seed_everything
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
...@@ -30,9 +31,7 @@ def test_rms_norm( ...@@ -30,9 +31,7 @@ def test_rms_norm(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
layer = RMSNorm(hidden_size).to(dtype=dtype) layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1) layer.weight.data.normal_(mean=1.0, std=0.1)
......
...@@ -48,7 +48,7 @@ WTYPE_ZEROPOINTS = [ ...@@ -48,7 +48,7 @@ WTYPE_ZEROPOINTS = [
# `is_quant_method_supported` conflates kernels with quantization methods # `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantizations methods can have # an assumption which is breaking down as quantizations methods can have
# have kernels and some kernels support multiple quantization methods. # have kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
def rand_data(shape, dtype=torch.float16): def rand_data(shape, dtype=torch.float16):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment