gguf.py 1.56 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import torch


def ggml_dequantize(
    weight: torch.Tensor, quant_type: int, M: int, N: int, dtype: torch.dtype
) -> torch.Tensor:
    """Dequantize a GGUF-quantized weight tensor to a dense tensor.

    Args:
        weight: GGUF-quantized weight data.
        quant_type: GGML quantization type id (e.g. a GGML_TYPE_* value).
        M: Number of rows of the dequantized output; must be positive.
        N: Number of columns of the dequantized output; must be positive.
        dtype: Desired dtype of the dequantized output tensor.

    Returns:
        The dequantized (M, N) tensor produced by the sgl_kernel custom op.

    Raises:
        ValueError: If M or N is not a positive dimension.
    """
    # Validate eagerly with a real exception: `assert` is stripped under
    # `python -O` and would let invalid shapes reach the native kernel.
    if M <= 0 or N <= 0:
        raise ValueError(
            "GGUF weight Input shape must be of positive dimensions"
        )
    return torch.ops.sgl_kernel.ggml_dequantize.default(weight, quant_type, M, N, dtype)


def ggml_mul_mat_vec_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-vector product of a GGUF-quantized weight with an activation.

    Thin wrapper that forwards all arguments unchanged to the
    ``sgl_kernel.ggml_mul_mat_vec_a8`` custom op and returns its result.
    """
    op = torch.ops.sgl_kernel.ggml_mul_mat_vec_a8.default
    return op(weight, x, quant_type, row)


def ggml_mul_mat_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-matrix product of a GGUF-quantized weight with an activation.

    Thin wrapper that forwards all arguments unchanged to the
    ``sgl_kernel.ggml_mul_mat_a8`` custom op and returns its result.
    """
    op = torch.ops.sgl_kernel.ggml_mul_mat_a8.default
    return op(weight, x, quant_type, row)


def ggml_moe_a8(
    input: torch.Tensor,
    weight: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_token_post_padded: torch.Tensor,
    type: int,
    row: int,
    topk: int,
    tokens: int,
) -> torch.Tensor:
    """MoE matmul over GGUF-quantized expert weights (sorted-token layout).

    Thin wrapper that forwards all arguments unchanged to the
    ``sgl_kernel.ggml_moe_a8`` custom op and returns its result.
    Token routing metadata (``sorted_token_ids``, ``expert_ids``,
    ``num_token_post_padded``) is passed through as-is; its layout is
    defined by the kernel, not here.
    """
    args = (
        input,
        weight,
        sorted_token_ids,
        expert_ids,
        num_token_post_padded,
        type,
        row,
        topk,
        tokens,
    )
    return torch.ops.sgl_kernel.ggml_moe_a8.default(*args)


def ggml_moe_a8_vec(
    input: torch.Tensor,
    weight: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    type: int,
    row: int,
    tokens: int,
) -> torch.Tensor:
    """Vectorized MoE matmul over GGUF-quantized expert weights.

    Thin wrapper that forwards all arguments unchanged to the
    ``sgl_kernel.ggml_moe_a8_vec`` custom op and returns its result.
    """
    op = torch.ops.sgl_kernel.ggml_moe_a8_vec.default
    return op(input, weight, topk_ids, top_k, type, row, tokens)


def ggml_moe_get_block_size(type: int) -> int:
    """Return the MoE kernel block size for the given GGML quantization type.

    Thin wrapper around the ``sgl_kernel.ggml_moe_get_block_size`` custom op.
    """
    op = torch.ops.sgl_kernel.ggml_moe_get_block_size.default
    return op(type)