qdq_4.cuh 4.07 KB
Newer Older
nicodafagood's avatar
nicodafagood committed
1
2
3
4
5
6
7
8
9
10
/*
Copied from https://github.com/turboderp/exllamav2
*/

#ifndef _qdq_4_cuh
#define _qdq_4_cuh

#include "qdq_util.cuh"

namespace vllm {
nicodafagood's avatar
nicodafagood committed
11
namespace mygq {
nicodafagood's avatar
nicodafagood committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// Permutation:
//
// 77775555 33331111  66664444 22220000

__forceinline__ __device__ void shuffle_4bit_8
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0];
    uint32_t qb = 0;

    #pragma unroll
    for (int i = 0; i < 4; i++)
    {
        uint32_t qa0 = qa & 0x0f;
        uint32_t qa1 = (qa & 0xf0) >> 4;
        qa >>= 8;
        qb |= (qa1 << (i * 4 + 16));
        qb |= (qa0 << (i * 4));
    }
    q[0] = qb;
}

__forceinline__ __device__ void dequant_4bit_8
(
    const uint32_t q_0,
    half2 (&dq)[4],
    int stride,
    const uint32_t zero
)
{
    const uint32_t c0 = 0x64006400;
    const half y16_ = __float2half_rn(1.0f / 16.0f);
    const half2 y16 = __halves2half2(y16_, y16_);
    const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero);
    const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
    const half2 z1 = __half2half2(z1_.as_half);
    const half2 z16 = __half2half2(z16_);

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024
    qa >>= 8;
    half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5])      + 1024
    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024

    dq[0] = __hadd2(q0.as_half2, z1);
    dq[1] = __hfma2(q1.as_half2, y16, z16);
    dq[2] = __hadd2(q2.as_half2, z1);
    dq[3] = __hfma2(q3.as_half2, y16, z16);
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale
(
    const uint32_t zero,
    const half scale,
    half2 (&z1z16)[2],
    half2 (&y1y16)[2]
)
{
    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));

    half2 scale2 = __half2half2(scale);

    z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half));
    z1z16[1] = __hmul2(scale2, __half2half2(z16));

    const half y1 = __float2half_rn(1.0f);
    const half y16 = __float2half_rn(1.0f / 16.0f);

    y1y16[0] = __hmul2(scale2, __half2half2(y1));
    y1y16[1] = __hmul2(scale2, __half2half2(y16));
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero
(
    const uint32_t zero,
    half2(&z1z16)[2],
    half2(&y1y16)[2]
)
{
    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));

    z1z16[0] = __half2half2(z1.as_half);
    z1z16[1] = __half2half2(z16);

    const half y1 = __float2half_rn(1.0f);
    const half y16 = __float2half_rn(1.0f / 16.0f);

    y1y16[0] = __half2half2(y1);
    y1y16[1] = __half2half2(y16);
}


nicodafagood's avatar
nicodafagood committed
110
__forceinline__ __device__ void dequant_4bit_8_mygq
nicodafagood's avatar
nicodafagood committed
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
(
    const uint32_t q_0,
    half2 (&dq)[4],
    half2 (&z1z16)[2],
    half2 (&y1y16)[2],
    int stride,
    bool scaled
)
{
    const uint32_t c0 = 0x64006400;

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0]      + 1024, q[1]      + 1024 )
    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 )
    qa >>= 8;
    half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4]      + 1024, q[5]      + 1024 )
    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 )

    if (scaled)
    {
        dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]);  // half2( q[0] * s - z * s, q[1] * s - z * s)
        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] * s - z * s, q[3] * s - z * s)
        dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]);
        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);
    }
    else
    {
        dq[0] = __hadd2(q0.as_half2,           z1z16[0]);  // half2( q[0] - z, q[1] - z )
        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] - z, q[3] - z )
        dq[2] = __hadd2(q2.as_half2,           z1z16[0]);  // half2( q[4] - z, q[5] - z )
        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);  // half2( q[6] - z, q[7] - z )
    }
}
nicodafagood's avatar
nicodafagood committed
144
}  // namespace mygq
nicodafagood's avatar
nicodafagood committed
145
146
147
}  // namespace vllm

#endif