# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import gguf
import pytest
import torch

from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops  # noqa: F401


@pytest.mark.parametrize("quant_type", [12])
def test_ggml_opcheck(quant_type):
    """Run ``opcheck`` over the GGML custom ops registered under ``torch.ops._C``.

    The ops exercised, in order, are ``ggml_dequantize``, ``ggml_mul_mat_a8``,
    ``ggml_mul_mat_vec_a8``, ``ggml_moe_a8`` and ``ggml_moe_a8_vec``. Inputs
    are random tensors allocated on the ``"cuda"`` device, so a CUDA device is
    required. The test contains no assertions of its own; it relies on
    ``opcheck`` to raise if an op fails its checks.

    Args:
        quant_type: Integer key into ``gguf.GGML_QUANT_SIZES``; also passed
            through unchanged to every op call.
    """
    block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]

    # --- 2-D quantized weight: dequantize and mat-mul ops ---
    shape = [256, 1152]
    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
    m = qweight.shape[0]
    # Bytes per row // type_size gives the block count; each block expands to
    # block_size values (per the tuple read from gguf.GGML_QUANT_SIZES above).
    n = qweight.shape[1] // type_size * block_size
    opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n, torch.float16))

    x = torch.rand((m, 512), device="cuda", dtype=torch.float16)
    opcheck(torch.ops._C.ggml_mul_mat_a8, (qweight, x, quant_type, qweight.shape[0]))
    opcheck(
        torch.ops._C.ggml_mul_mat_vec_a8, (qweight, x, quant_type, qweight.shape[0])
    )

    # --- 3-D quantized weight: MoE ops ---
    # NOTE(review): presumably the leading dim (256) is the expert count, which
    # would match the exclusive upper bound used for expert_ids -- confirm.
    shape = [256, 1024, 336]
    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
    x = torch.rand((1, 1024), device="cuda", dtype=torch.float16)
    sorted_token_ids = torch.arange(776, device="cuda")
    expert_ids = torch.randint(0, 256, (194,), device="cuda")
    num_tokens_post_padded = torch.tensor([1], dtype=torch.int64, device="cuda")

    opcheck(
        torch.ops._C.ggml_moe_a8,
        (
            x,
            qweight,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            quant_type,
            qweight.shape[0],
            1,
            x.shape[0],
        ),
    )

    topk_ids = torch.zeros((1, 1), device="cuda", dtype=torch.int32)

    opcheck(
        torch.ops._C.ggml_moe_a8_vec,
        (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]),
    )