from __future__ import annotations
from typing import Callable, Sequence

from numpy.typing import DTypeLike

from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor

import numpy as np


def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)


def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % type_size != 0:
        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
    return (*shape[:-1], shape[-1] // type_size * block_size)
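
# For example, with Q8_0 (block_size=32, type_size=34 in GGML_QUANT_SIZES, since
# each 32-element block is stored as a float16 scale plus 32 int8 values):
#   quant_shape_to_byte_shape((4096, 4096), GGMLQuantizationType.Q8_0) == (4096, 4352)
#   quant_shape_from_byte_shape((4096, 4352), GGMLQuantizationType.Q8_0) == (4096, 4096)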


# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
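# Worked example of the rounding below: 0x3f800000 (1.0f) maps to 0x3f80; for a
# tie (low half exactly 0x8000), the result rounds to the even high half, so
# 0x3f808000 -> 0x3f80 while 0x3f818000 -> 0x3f82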
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
    n = n.astype(np.float32, copy=False).view(np.uint32)
    # force nan to quiet
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
    # round to nearest even
    n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)


# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
    rows = arr.reshape((-1, arr.shape[-1]))
    osize = 1
    for dim in oshape:
        osize *= dim
    out = np.empty(shape=osize, dtype=otype)
    # compute over groups of 16 rows (arbitrary, but seems good for performance)
    n_groups = (rows.shape[0] // 16) or 1
    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
    return out.reshape(oshape)


def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)


__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)


def quantize_bf16(n: np.ndarray):
    if type(n) is LazyNumpyTensor:
        return __quantize_bf16_lazy(n)
    else:
        return __quantize_bf16_array(n)
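
# Usage sketch (hypothetical values):
#   data = np.arange(64, dtype=np.float32).reshape(2, 32)
#   packed = quantize_bf16(data)  # uint16 array with the same shape as the input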


__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]


def can_quantize_to_q8_0(n: np.ndarray) -> bool:
    return n.shape[-1] % __q8_block_size == 0


# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
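# e.g. np_roundf(np.array([0.5, -1.5, 2.5])) -> [1., -2., 3.], whereas
# np.round gives [0., -2., 2.] (ties to even)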
def np_roundf(n: np.ndarray) -> np.ndarray:
    a = abs(n)
    floored = np.floor(a)
    b = floored + np.floor(2 * (a - floored))
    return np.sign(n) * b


def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
    return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)


# Implementation of Q8_0 with bit-exact results matching the reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
    shape = n.shape
    assert shape[-1] % __q8_block_size == 0

    n_blocks = n.size // __q8_block_size

    blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)

    d = abs(blocks).max(axis=1, keepdims=True) / 127
    with np.errstate(divide="ignore"):
        id = np.where(d == 0, 0, 1 / d)
    qs = np_roundf(blocks * id)

    # (n_blocks, 2)
    d = d.astype(np.float16).view(np.uint8)
    # (n_blocks, block_size)
    qs = qs.astype(np.int8).view(np.uint8)

    assert d.shape[1] + qs.shape[1] == __q8_type_size

    return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
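

# Illustration only (not part of the original module): a hypothetical inverse of
# __quantize_q8_0_rows, assuming the per-block layout shown above (2 bytes of
# float16 scale followed by __q8_block_size int8 quants).
def __dequantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
    blocks = n.reshape((-1, __q8_type_size))
    # split each block back into its scale and quantized values
    d = blocks[:, :2].copy().view(np.float16).astype(np.float32)  # (n_blocks, 1)
    qs = blocks[:, 2:].view(np.int8)                              # (n_blocks, block_size)
    return d * qs  # broadcast each block's scale over its quants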


def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
    return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))


__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
    __quantize_q8_0_array,
    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)


def quantize_q8_0(data: np.ndarray):
    if type(data) is LazyNumpyTensor:
        return __quantize_q8_0_lazy(data)
    else:
        return __quantize_q8_0_array(data)
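

# Usage sketch (hypothetical shapes; the last dimension must be a multiple of the
# Q8_0 block size, which can_quantize_to_q8_0 checks):
#   data = np.random.rand(8, 64).astype(np.float32)
#   assert can_quantize_to_q8_0(data)
#   packed = quantize_q8_0(data)  # uint8, shape (8, 64 // 32 * 34) == (8, 68)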