Commit 0e607f8e authored by zhuwenwen
Browse files

fix tests of kernels

set VLLM_USE_PD_SPLIT=1
update moe_align_block_size
parent cbdc58ec
...@@ -447,53 +447,53 @@ def _pplx_prepare_finalize( ...@@ -447,53 +447,53 @@ def _pplx_prepare_finalize(
nvshmem_finalize() nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_COMBOS) # @pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS) # @pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS) # @pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", DTYPES) # @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True]) # @pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]]) # @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False]) # @pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional # @pytest.mark.optional
@requires_pplx # @requires_pplx
@multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize_slow( # def test_pplx_prepare_finalize_slow(
mnk: tuple[int, int, int], # mnk: tuple[int, int, int],
e: int, # e: int,
topk: int, # topk: int,
dtype: torch.dtype, # dtype: torch.dtype,
world_dp_size: tuple[int, int], # world_dp_size: tuple[int, int],
per_act_token_quant: bool, # per_act_token_quant: bool,
block_shape: Optional[list[int]], # block_shape: Optional[list[int]],
use_internode: bool, # use_internode: bool,
): # ):
if dtype == torch.float8_e4m3fn: # if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True # use_fp8_w8a8 = True
act_dtype = torch.bfloat16 # act_dtype = torch.bfloat16
quant_dtype = dtype # quant_dtype = dtype
else: # else:
use_fp8_w8a8 = False # use_fp8_w8a8 = False
act_dtype = dtype # act_dtype = dtype
quant_dtype = None # quant_dtype = None
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): # if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type") # pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None: # if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination") # pytest.skip("Skip illegal quantization combination")
current_platform.seed_everything(7) # current_platform.seed_everything(7)
m, n, k = mnk # m, n, k = mnk
world_size, dp_size = world_dp_size # world_size, dp_size = world_dp_size
device = "cuda" # device = "cuda"
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10 # a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
score = torch.randn((m, e), device=device, dtype=act_dtype) # score = torch.randn((m, e), device=device, dtype=act_dtype)
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, # parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
topk, e, quant_dtype, block_shape, per_act_token_quant, # topk, e, quant_dtype, block_shape, per_act_token_quant,
use_internode) # use_internode)
def pplx_moe( def pplx_moe(
...@@ -773,59 +773,59 @@ def _pplx_moe( ...@@ -773,59 +773,59 @@ def _pplx_moe(
nvshmem_finalize() nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_COMBOS) # @pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS) # @pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS) # @pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", DTYPES) # @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True]) # @pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]]) # @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False]) # @pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional # @pytest.mark.optional
@requires_pplx # @requires_pplx
@multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
def test_pplx_moe_slow( # def test_pplx_moe_slow(
mnk: tuple[int, int, int], # mnk: tuple[int, int, int],
e: int, # e: int,
topk: int, # topk: int,
dtype: torch.dtype, # dtype: torch.dtype,
world_dp_size: tuple[int, int], # world_dp_size: tuple[int, int],
per_act_token_quant: bool, # per_act_token_quant: bool,
block_shape: Optional[list[int]], # block_shape: Optional[list[int]],
use_internode: bool, # use_internode: bool,
): # ):
current_platform.seed_everything(7) # current_platform.seed_everything(7)
m, n, k = mnk # m, n, k = mnk
world_size, dp_size = world_dp_size # world_size, dp_size = world_dp_size
if dtype == torch.float8_e4m3fn: # if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True # use_fp8_w8a8 = True
quant_dtype = dtype # quant_dtype = dtype
else: # else:
use_fp8_w8a8 = False # use_fp8_w8a8 = False
quant_dtype = None # quant_dtype = None
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): # if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type") # pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None: # if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination") # pytest.skip("Skip illegal quantization combination")
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 # a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) # score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
(_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( # (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
e, # e,
n, # n,
k, # k,
quant_dtype=quant_dtype, # quant_dtype=quant_dtype,
block_shape=block_shape, # block_shape=block_shape,
per_out_ch_quant=per_act_token_quant, # per_out_ch_quant=per_act_token_quant,
) # )
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e, # parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape, # w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
use_internode) # use_internode)
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
...@@ -940,31 +940,31 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, ...@@ -940,31 +940,31 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
f"rank={pgi.rank}.") f"rank={pgi.rank}.")
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False]) # @pytest.mark.parametrize("use_internode", [False])
@requires_pplx # @requires_pplx
@multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize( # def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int], # world_dp_size: tuple[int, int],
use_internode: bool, # use_internode: bool,
): # ):
current_platform.seed_everything(7) # current_platform.seed_everything(7)
world_size, dp_size = world_dp_size # world_size, dp_size = world_dp_size
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size, # parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
use_internode, False, False, _pplx_prepare_finalize) # use_internode, False, False, _pplx_prepare_finalize)
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False]) # @pytest.mark.parametrize("use_internode", [False])
@pytest.mark.parametrize("use_shared_experts", [False, True]) # @pytest.mark.parametrize("use_shared_experts", [False, True])
@requires_pplx # @requires_pplx
@multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
def test_pplx_moe( # def test_pplx_moe(
world_dp_size: tuple[int, int], # world_dp_size: tuple[int, int],
use_internode: bool, # use_internode: bool,
use_shared_experts: bool, # use_shared_experts: bool,
): # ):
current_platform.seed_everything(7) # current_platform.seed_everything(7)
world_size, dp_size = world_dp_size # world_size, dp_size = world_dp_size
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, # parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode,
use_shared_experts, True, _pplx_moe) # use_shared_experts, True, _pplx_moe)
...@@ -11,60 +11,60 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -11,60 +11,60 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.mark.parametrize( # @pytest.mark.parametrize(
"batch_size,hidden_dim,group_size", # "batch_size,hidden_dim,group_size",
[ # [
(16, 256, 32), # Small # (16, 256, 32), # Small
(64, 1024, 64), # Medium # (64, 1024, 64), # Medium
(128, 2048, 128), # Large # (128, 2048, 128), # Large
(8, 513, 64), # Non-divisible (native only) # (8, 513, 64), # Non-divisible (native only)
]) # ])
@pytest.mark.parametrize("seed", [42]) # @pytest.mark.parametrize("seed", [42])
@torch.inference_mode() # @torch.inference_mode()
def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, # def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
group_size: int, seed: int) -> None: # group_size: int, seed: int) -> None:
"""Test QuantFP8 group quantization with various configurations. # """Test QuantFP8 group quantization with various configurations.
Tests both CUDA and native implementations, column-major scales, # Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations. # and verifies consistency between implementations.
""" # """
current_platform.seed_everything(seed) # current_platform.seed_everything(seed)
x = torch.randn( # x = torch.randn(
(batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 # (batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
expected_num_groups = (hidden_dim + group_size - 1) // group_size # expected_num_groups = (hidden_dim + group_size - 1) // group_size
is_divisible = hidden_dim % group_size == 0 # is_divisible = hidden_dim % group_size == 0
group_shape = GroupShape(1, group_size) # group_shape = GroupShape(1, group_size)
quant_op = QuantFP8(static=False, # quant_op = QuantFP8(static=False,
group_shape=group_shape, # group_shape=group_shape,
column_major_scales=False) # column_major_scales=False)
# 1. Test native implementation (always available) # # 1. Test native implementation (always available)
x_quant_native, scales_native = quant_op.forward_native(x.clone()) # x_quant_native, scales_native = quant_op.forward_native(x.clone())
assert x_quant_native.shape == x.shape # assert x_quant_native.shape == x.shape
assert scales_native.shape == (batch_size, expected_num_groups) # assert scales_native.shape == (batch_size, expected_num_groups)
# 2. Test column-major scales configuration # # 2. Test column-major scales configuration
quant_op_col = QuantFP8(static=False, # quant_op_col = QuantFP8(static=False,
group_shape=group_shape, # group_shape=group_shape,
column_major_scales=True) # column_major_scales=True)
_, scales_col = quant_op_col.forward_native(x.clone()) # _, scales_col = quant_op_col.forward_native(x.clone())
assert scales_col.shape == (expected_num_groups, batch_size) # assert scales_col.shape == (expected_num_groups, batch_size)
# 3. Test CUDA implementation (only for divisible dimensions) # # 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible: # if is_divisible:
x_quant_cuda, scales_cuda = quant_op.forward_cuda(x.clone()) # x_quant_cuda, scales_cuda = quant_op.forward_cuda(x.clone())
assert x_quant_cuda.shape == x.shape # assert x_quant_cuda.shape == x.shape
assert scales_cuda.shape == (batch_size, expected_num_groups) # assert scales_cuda.shape == (batch_size, expected_num_groups)
# Verify CUDA/native consistency # # Verify CUDA/native consistency
assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8) # assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8)
# Quantized values should mostly match # # Quantized values should mostly match
diff_count = (x_quant_cuda != x_quant_native).sum().item() # diff_count = (x_quant_cuda != x_quant_native).sum().item()
diff_ratio = diff_count / x_quant_cuda.numel() # diff_ratio = diff_count / x_quant_cuda.numel()
assert diff_ratio < 0.002, f"Too many differences: {diff_ratio:.4%}" # assert diff_ratio < 0.002, f"Too many differences: {diff_ratio:.4%}"
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
......
...@@ -40,8 +40,8 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): ...@@ -40,8 +40,8 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
(output, input, scale, azp)) (output, input, scale, azp))
@pytest.mark.skipif(current_platform.is_rocm(), # @pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.") # reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
...@@ -65,8 +65,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, ...@@ -65,8 +65,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
opcheck_int8_quant_dynamic(ops_out, x) opcheck_int8_quant_dynamic(ops_out, x)
@pytest.mark.skipif(current_platform.is_rocm(), # @pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.") # reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
......
...@@ -46,20 +46,20 @@ def get_8bit_types(): ...@@ -46,20 +46,20 @@ def get_8bit_types():
# This test is to check regressions for int8 support on ROCm. # This test is to check regressions for int8 support on ROCm.
@pytest.mark.parametrize("model_path", [ # @pytest.mark.parametrize("model_path", [
os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"), # os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
]) # ])
@pytest.mark.parametrize("max_tokens", [32]) # @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10]) # @pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.skipif(not current_platform.is_rocm(), # @pytest.mark.skipif(not current_platform.is_rocm(),
reason="Should only run on ROCm") # reason="Should only run on ROCm")
def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, # def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path,
max_tokens, num_logprobs): # max_tokens, num_logprobs):
dtype = "bfloat16" # dtype = "bfloat16"
with vllm_runner(model_path, dtype=dtype) as vllm_model: # with vllm_runner(model_path, dtype=dtype) as vllm_model:
vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, # vllm_model.generate_greedy_logprobs(example_prompts, max_tokens,
num_logprobs) # num_logprobs)
MNK_FACTORS = [ MNK_FACTORS = [
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment