// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "common.h"
#include "cta_iterator.h"
#include "warp_iterator.h"

#include <cuda_pipeline_primitives.h>

#include <cassert>

namespace turbomind {

// Tensor-core HMMA m16n8k8: d = a * b + c (f16 inputs, f32 accumulate).
// Requires SM75+ (Turing); traps via assert when compiled for older archs.
__inline__ __device__ void
mma_m16n8k8_row_col(Array<float, 4>& d, const Array<half, 4>& a, const Array<half, 2>& b, Array<float, 4>& c)
{
#if TURBOMIND_ARCH_SM75
    uint32_t const* A = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const* B = reinterpret_cast<uint32_t const*>(&b);
    float const*    C = reinterpret_cast<float const*>(&c);
    float*          D = reinterpret_cast<float*>(&d);
    asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32  {%0,%1,%2,%3}, "
        "{%4,%5}, {%6}, {%7,%8,%9,%10};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
    assert(TURBOMIND_ARCH_SM75);
#endif
}

// Tensor-core HMMA m16n8k16: d = a * b + c (f16 inputs, f32 accumulate).
// Uses the native SM80+ instruction when available; otherwise decomposes into
// two chained m16n8k8 MMAs (second one accumulates into d).
__inline__ __device__ void
mma_m16n8k16_row_col(Array<float, 4>& d, const Array<half, 8>& a, const Array<half, 4>& b, Array<float, 4>& c)
{
#if TURBOMIND_ARCH_SM80
    uint32_t const* A = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const* B = reinterpret_cast<uint32_t const*>(&b);
    float const*    C = reinterpret_cast<float const*>(&c);
    float*          D = reinterpret_cast<float*>(&d);
    asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32  {%0,%1,%2,%3}, "
        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
    const Array<half, 4>* _a = (const Array<half, 4>*)&a;
    const Array<half, 2>* _b = (const Array<half, 2>*)&b;
    mma_m16n8k8_row_col(d, _a[0], _b[0], c);
    mma_m16n8k8_row_col(d, _a[1], _b[1], d);
#endif
}

// Transposes an 8x8 tile of b16 elements held two-per-thread across a warp,
// using shuffles (fallback for toolchains/archs without `movmatrix`).
// `value` packs two 16-bit elements; `lane_id` is this thread's lane [0,32).
__inline__ __device__ uint transpose_m8n8_b16_warp_shuffle(uint value, int lane_id)
{
    // Source lane holding the column data that becomes this lane's row.
    int    src_lane = lane_id / 8 + lane_id % 4 * 8;
    uint   u0       = __shfl_sync(0xffffffff, value, src_lane);
    uint   u1       = __shfl_sync(0xffffffff, value, src_lane + 4);

    short2 r;

    // Pick the low or high 16-bit half depending on which quadrant this lane
    // addresses in the transposed tile.
    if (lane_id % 8 < 4) {
        r.x = ((short2&)u0).x;
        r.y = ((short2&)u1).x;
    }
    else {
        r.x = ((short2&)u0).y;
        r.y = ((short2&)u1).y;
    }

    return (uint&)r;
}

// `movmatrix`-based 8x8 b16 tile transpose; the instruction is only exposed
// by CUDA 11.8+ and requires SM75+.
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8)
__inline__ __device__ uint transpose_m8n8_b16_movmatrix(uint a)
{
#if TURBOMIND_ARCH_SM75
    uint d;
    asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;\n" : "=r"(d) : "r"(a));
    return d;
#else
    assert(TURBOMIND_ARCH_SM75);
    return 0;
#endif
}
#endif

// Transposes an 8x8 tile of b16 elements distributed across the warp.
// Dispatches to the hardware `movmatrix` instruction when the toolchain
// supports it, otherwise to the warp-shuffle emulation.
__inline__ __device__ uint transpose_m8n8_b16(uint a, int lane_id)
{
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8)
    (void)lane_id;
    return transpose_m8n8_b16_movmatrix(a);
#else
    return transpose_m8n8_b16_warp_shuffle(a, lane_id);
#endif
}

// Opt-in element-wise arithmetic on CUDA vector types; brought into scope
// locally with `using namespace ops` where needed.
namespace ops {

// Component-wise sum of two float4 values.
__inline__ __device__ float4 operator+(const float4& a, const float4& b)
{
    float4 r;
    r.x = a.x + b.x;
    r.y = a.y + b.y;
    r.z = a.z + b.z;
    r.w = a.w + b.w;
    return r;
}

// Component-wise sum of two float2 values.
__inline__ __device__ float2 operator+(const float2& a, const float2& b)
{
    float2 r;
    r.x = a.x + b.x;
    r.y = a.y + b.y;
    return r;
}

}  // namespace ops

/// Threadblock-scope quantized GEMM template (presumably s4 weights x f16
/// activations, judging by the `gemm_s4_f16_nn` entry point — confirm).
/// Tiling is compile-time: CTA_{M,N,K} per threadblock, WARP_{M,N,K} per
/// warp, STAGES-deep software pipeline of async global->shared copies.
/// Sliced-k: kWarpCountK slices each reduce a K-partition; partials are
/// combined through shared memory by sum_slices().
template<int CTA_M,
         int CTA_N,
         int CTA_K,
         int WARP_M,
         int WARP_N,
         int WARP_K,
         int STAGES,
         int GROUP_SIZE,
         typename OutputOps>
struct Gemm {

    static constexpr int kWarpCountM = CTA_M / WARP_M;
    static constexpr int kWarpCountN = CTA_N / WARP_N;
    static constexpr int kWarpCountK = CTA_K / WARP_K;

    static constexpr int kWarpCountMN = kWarpCountM * kWarpCountN;
    static constexpr int kWarpCount   = kWarpCountMN * kWarpCountK;

    // One slice per K-warp; each slice owns a SLICE_K-wide K-partition.
    static constexpr int SLICES  = kWarpCountK;
    static constexpr int SLICE_K = CTA_K / SLICES;

    static_assert(SLICE_K % WARP_K == 0, "infeasible sliced-k setting");

    using IteratorA = turbomind::IteratorA<kWarpCountMN, CTA_M, CTA_N, CTA_K, STAGES, SLICES>;
    using IteratorQ = turbomind::IteratorQ<kWarpCountMN, CTA_M, CTA_N, CTA_K, STAGES, SLICES, GROUP_SIZE>;
    using IteratorB = turbomind::IteratorB<kWarpCountMN, CTA_M, CTA_N, CTA_K, STAGES, SLICES>;

    // Tensor-core instruction tile shape (mma.m16n8k16).
    static constexpr int OP_M = 16;
    static constexpr int OP_N = 8;
    static constexpr int OP_K = 16;

    using WarpIterA = turbomind::WarpIteratorA<CTA_M,
                                               CTA_K,
                                               WARP_M,
                                               WARP_K,
                                               OP_M,
                                               OP_K,
                                               GROUP_SIZE,
                                               STAGES,
                                               IteratorA::kSizePerStage,
                                               IteratorQ::kSizePerStage>;

    using WarpIterB =
        turbomind::WarpIteratorB<CTA_N, CTA_K, WARP_N, WARP_K, OP_N, OP_K, IteratorB::kSmemPadCtaK, STAGES>;

    /// One CTA_K-deep main-loop step: runs ITER_K tensor-core MMA iterations
    /// with double-buffered register fragments, while spreading the async
    /// global->shared prefetch of the next pipeline stage across iterations.
    /// Decrements `gemm_iter` when a stage is committed.
    __device__ void warp_mma(IteratorA& iter_A,
                             IteratorQ& iter_Q,
                             IteratorB& iter_B,
                             WarpIterA& warp_iter_A,
                             WarpIterB& warp_iter_B,
                             float*     accum,
                             int        slice_id,
                             int&       gemm_iter)
    {
        constexpr int ITER_M = WARP_M / OP_M;
        constexpr int ITER_N = WARP_N / OP_N;
        constexpr int ITER_K = WARP_K / OP_K;

        // Prefetch quota per k-iteration so a full stage is issued per loop.
        constexpr int kBatchA = (IteratorA::kIterCount + ITER_K - 1) / ITER_K;
        constexpr int kBatchQ = (IteratorQ::kIterCount + ITER_K - 1) / ITER_K;
        constexpr int kBatchB = (IteratorB::kIterCount + ITER_K - 1) / ITER_K;

        auto frag_C_ptr = (Array<float, 4>*)accum;  // [ITER_N, ITER_M]

        PRAGMA_UNROLL
        for (int iter_k = 0; iter_k < ITER_K; ++iter_k) {

            // Load fragments for the NEXT k-iteration while computing on the
            // current one (double buffering).
            warp_iter_A.load(warp_frag_A_[(iter_k + 1) % 2], (iter_k + 1) % ITER_K);
            warp_iter_B.load(warp_frag_B_[(iter_k + 1) % 2], (iter_k + 1) % ITER_K);

            auto warp_frag_A = warp_frag_A_[iter_k % 2];
            auto warp_frag_B = warp_frag_B_[iter_k % 2];

            PRAGMA_UNROLL
            for (int iter_m = 0; iter_m < ITER_M; ++iter_m) {
                PRAGMA_UNROLL
                for (int iter_n = 0; iter_n < ITER_N; ++iter_n) {
                    auto& frag_A = warp_frag_A[iter_m];
                    auto& frag_B = warp_frag_B[iter_n];
                    auto& frag_C = frag_C_ptr[iter_n * ITER_M + iter_m];
                    mma_m16n8k16_row_col(frag_C, frag_A, frag_B, frag_C);
                }
            }

            if (iter_k < ITER_K - 1) {
                iter_A.prefetch_batch(iter_k, kBatchA, gemm_iter > 0);
                iter_Q.prefetch_batch(iter_k, kBatchQ, gemm_iter > 0);
                iter_B.prefetch_batch(iter_k, kBatchB, gemm_iter > 0);
            }

            if (iter_k == ITER_K - 2) {
                // Issue the final prefetch batch, commit the stage, wait for
                // the oldest in-flight stage and advance all iterators.
                iter_A.prefetch_batch(iter_k + 1, kBatchA, gemm_iter > 0);
                iter_Q.prefetch_batch(iter_k + 1, kBatchQ, gemm_iter > 0);
                iter_B.prefetch_batch(iter_k + 1, kBatchB, gemm_iter > 0);

                __pipeline_commit();
                __pipeline_wait_prior(STAGES - 2);
                sync_slice(slice_id);

                iter_A.next_stage();
                iter_Q.next_stage();
                iter_B.next_stage();

                warp_iter_A.next_stage();
                warp_iter_B.next_stage();

                --gemm_iter;
            }
        }
    }

    // Element-wise copy of a fixed-size array.
    template<typename T, int N>
    __device__ static void copy(T (&dst)[N], const T (&src)[N])
    {
        PRAGMA_UNROLL
        for (int i = 0; i < N; ++i) {
            dst[i] = src[i];
        }
    }

    // Value-initializes (zeroes, for arithmetic T) a fixed-size array.
    template<typename T, int N>
    __device__ static void clear(T (&dst)[N])
    {
        PRAGMA_UNROLL
        for (int i = 0; i < N; ++i) {
            dst[i] = T{};
        }
    }

    /// Barrier over the threads of one k-slice. With a single slice this is a
    /// plain __syncthreads(); otherwise each slice (group) syncs on its own
    /// named barrier so slices don't serialize each other.
    __device__ void sync_slice(int slice_id)
    {
        if constexpr (SLICES == 1) {
            __syncthreads();
        }
        else {
            constexpr int      SLICE_GROUP = (SLICES + 7) / 8;
            constexpr uint32_t num_threads = kWarpCountMN * WARP_SIZE;
            const uint32_t     barrier_id  = slice_id / SLICE_GROUP + 1;
            asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "n"(num_threads));
        }
    }

    /// Accumulates a previously stored CTA-partial result into tb_frag_C.
    /// NOTE(review): indexing assumes one thread per M element
    /// (blockDim.x == CTA_M) — confirm against the launch configuration.
    __device__ void load_partial(float* tb_frag_C, const float* partial_C, int cta, int slice_id)
    {
        if (slice_id == 0) {
            PRAGMA_UNROLL
            for (int i = 0; i < CTA_N; ++i) {
                tb_frag_C[i] += partial_C[cta * CTA_N * CTA_M + i * CTA_M + threadIdx.x];
            }
        }
    }

    /// Writes this CTA's partial result for later reduction by load_partial.
    __device__ void store_partial(float* partial_C, const float* tb_frag_C, int cta, int slice_id)
    {
        if (slice_id == 0) {
            PRAGMA_UNROLL
            for (int i = 0; i < CTA_N; ++i) {
                partial_C[cta * CTA_N * CTA_M + i * CTA_M + threadIdx.x] = tb_frag_C[i];
            }
        }
    }

    /// Converts the f32 accumulators to f16, transposes each 8x8 tile so a
    /// thread owns contiguous output elements, and hands them to the
    /// `Index`-th output operator for the global-memory store.
    template<int Index>
    __device__ void store_accum(float* tb_frag_C,
                                float* tb_smem_C,
                                half*  C,
                                int    m,
                                int    n,
                                int    cta_m,
                                int    cta_n,
                                int    warp_id_m,
                                int    warp_id_n,
                                int    lane_id,
                                int    slice_id)
    {
        // Only slice 0 holds the fully reduced accumulators (see sum_slices).
        if (slice_id != 0) {
            return;
        }

        // Fragment layout reference:
        // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#mma-16816-c
        PRAGMA_UNROLL
        for (int i = 0; i < WARP_N / OP_N; ++i) {
            const float2* frag_C = (float2*)&tb_frag_C[i * WARP_M / OP_M * 4];
            const int     nn     = cta_n + warp_id_n * WARP_N + i * OP_N + lane_id / 4;
            PRAGMA_UNROLL
            for (int j = 0; j < WARP_M / OP_M; ++j) {
                PRAGMA_UNROLL
                for (int x = 0; x < 2; ++x) {
                    const int mm = cta_m + warp_id_m * WARP_M + j * OP_M + x * 8 + lane_id % 4 * 2;
                    // convert to half
                    half2 half_C = __float22half2_rn(frag_C[j * 2 + x]);
                    // transpose 8x8 accum tile
                    uint trans_C = transpose_m8n8_b16((uint&)half_C, lane_id);
                    // store to global memory
                    OutputOps::template apply<Index>(trans_C, mm, nn, C, m, n);
                }
            }
        }
    }

    /// Reduces the per-slice partial accumulators through shared memory;
    /// after this, slice 0's tb_frag_C holds the full sum.
    __device__ void
    sum_slices(float* tb_frag_C, float* tb_smem_C, int warp_id_m, int warp_id_n, int lane_id, int slice_id)
    {
        int offset_m = warp_id_m * WARP_M / OP_M;
        int offset_n = warp_id_n * WARP_N / OP_N;

        // Each slice in turn adds its fragments into the shared accumulator.
        PRAGMA_UNROLL
        for (int z = 0; z < SLICES; ++z) {
            if (slice_id == z) {
                PRAGMA_UNROLL
                for (int i = 0; i < WARP_N / OP_N; ++i) {
                    PRAGMA_UNROLL
                    for (int j = 0; j < WARP_M / OP_M; ++j) {
                        PRAGMA_UNROLL
                        for (int x = 0; x < 4; ++x) {
                            int src = (i * WARP_M / OP_M + j) * 4 + x;
                            int dst = ((i + offset_n) * CTA_M / OP_M + j + offset_m) * 4 + x;
                            if (z > 0) {
                                using namespace ops;
                                tb_frag_C[src] = tb_smem_C[dst * WARP_SIZE + lane_id] + tb_frag_C[src];
                            }
                            tb_smem_C[dst * WARP_SIZE + lane_id] = tb_frag_C[src];
                        }
                    }
                }
            }
            __syncthreads();
        }

        // Slice 0 reads back the fully reduced fragments.
        if (slice_id == 0) {
            PRAGMA_UNROLL
            for (int i = 0; i < WARP_N / OP_N; ++i) {
                PRAGMA_UNROLL
                for (int j = 0; j < WARP_M / OP_M; ++j) {
                    PRAGMA_UNROLL
                    for (int x = 0; x < 4; ++x) {
                        int src = ((i + offset_n) * CTA_M / OP_M + j + offset_m) * 4 + x;
                        int dst = (i * WARP_M / OP_M + j) * 4 + x;

                        tb_frag_C[dst] = tb_smem_C[src * WARP_SIZE + lane_id];
                    }
                }
            }
        }
    }

    // Double-buffered register fragments for the A/B operands of the MMAs.
    Array<half, 8> warp_frag_A_[2][WARP_M / OP_M];
    Array<half, 4> warp_frag_B_[2][WARP_N / OP_N];

    /// Device entry point: computes one CTA tile of C = dequant(A, Q) * B.
    /// Expects dynamic shared memory sized for the A/B stages (reused for the
    /// sliced-k reduction) and `kWarpCount` warps per block.
    __device__ void run_v2(half* __restrict__ C,
                           const uint* __restrict__ A,
                           const half* __restrict__ B,
                           const half2* __restrict__ Q,
                           int M,
                           int N,
                           int K,
                           int output_op_idx)
    {
        static_assert(WARP_M % OP_N == 0);

        // Per-thread accumulator tile, [ITER_N, ITER_M, 4] floats.
        float tb_frag_C[(WARP_N / OP_N) * (WARP_M / OP_M) * 4];

        extern __shared__ uint8_t smem[];

        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;

        // Warp rank decomposition: M fastest, then N, then K (slice).
        const int warp_id_m  = warp_id % kWarpCountM;
        const int warp_id_nk = warp_id / kWarpCountM;
        const int warp_id_n  = warp_id_nk % kWarpCountN;
        const int warp_id_k  = warp_id_nk / kWarpCountN;

        const int warp_id_mn = warp_id_n * kWarpCountM + warp_id_m;

        const int slice_id = warp_id_k;

        const int cta_k = slice_id * SLICE_K;  // sliced-k offset
        const int cta_m = blockIdx.x * CTA_M;
        const int cta_n = blockIdx.y * CTA_N;

        // Each slice has its own partition of shared memory.
        uint4* const tb_smem_A = (uint4*)(smem + IteratorA::kSmemByteSize * slice_id);
        half* const tb_smem_B = (half*)(smem + IteratorA::kSmemByteSize * SLICES + IteratorB::kSmemByteSize * slice_id);

        // [CTA_N / OP_N, CTA_M / OP_M, 4, WARP_SIZE], all mn fragments in CTA;
        // aliases the A/B staging area, which is dead by the time it is used.
        float* const tb_smem_C = (float*)smem;

        __shared__ typename IteratorQ::Storage tb_smem_Q_storage;

        auto tb_smem_Q = tb_smem_Q_storage.data[slice_id];

        IteratorA iter_A{A, tb_smem_A, M, K, cta_m, cta_k, warp_id_mn, lane_id};
        IteratorQ iter_Q{Q, tb_smem_Q, M, K, cta_m, cta_k, warp_id_mn, lane_id};
        IteratorB iter_B{B, tb_smem_B, K, N, cta_n, cta_k, warp_id_mn, lane_id};

        const int offset_m = warp_id_m * WARP_M + lane_id;

        WarpIterA warp_iter_A(iter_A.smem_, iter_Q.smem_, warp_id, lane_id, offset_m, cta_k);
        WarpIterB warp_iter_B(iter_B.smem_int_ptr_, warp_id_n, lane_id, 0);

        int gemm_iter = (K + CTA_K - 1) / CTA_K;

        // Prologue: fill STAGES - 1 pipeline stages.
        PRAGMA_UNROLL
        for (int stage = 0; stage < STAGES - 1; ++stage, --gemm_iter) {
            iter_A.prefetch_stage(gemm_iter > 0);
            iter_Q.prefetch_stage(gemm_iter > 0);
            iter_B.prefetch_stage(gemm_iter > 0);
            __pipeline_commit();
        }

        clear(tb_frag_C);

        // Wait until the first stage has landed before touching smem.
        __pipeline_wait_prior(STAGES - 2);
        sync_slice(slice_id);

        warp_iter_A.load(warp_frag_A_[0], 0);
        warp_iter_B.load(warp_frag_B_[0], 0);

        // Main loop; runs past zero to drain the in-flight stages.
        PRAGMA_NO_UNROLL
        for (; gemm_iter > -STAGES + 1;) {
            warp_mma(iter_A, iter_Q, iter_B, warp_iter_A, warp_iter_B, tb_frag_C, slice_id, gemm_iter);
        }

        __pipeline_commit();
        __pipeline_wait_prior(0);
        __syncthreads();

        if constexpr (SLICES > 1) {
            sum_slices(tb_frag_C, tb_smem_C, warp_id_m, warp_id_n, lane_id, slice_id);
        }

        // Epilogue: dispatch to the selected output operator.
        switch (output_op_idx) {
            case 0:
                store_accum<0>(tb_frag_C, tb_smem_C, C, M, N, cta_m, cta_n, warp_id_m, warp_id_n, lane_id, slice_id);
                break;
            case 1:
                store_accum<1>(tb_frag_C, tb_smem_C, C, M, N, cta_m, cta_n, warp_id_m, warp_id_n, lane_id, slice_id);
                break;
            default:
                return;
        }
    }
};

// Kernel entry point: each threadblock computes one output tile via
// Gemm::run_v2. C is the f16 [M,N] result; A and Q are presumably the packed
// quantized weights and their (scale, zero) pairs — confirm against the Gemm
// iterators. Launch with Gemm::kWarpCount warps per block and dynamic shared
// memory sized for the Gemm's staging buffers.
template<typename Gemm>
__global__ void gemm_s4_f16_nn(half* __restrict__ C,
                               const uint* __restrict__ A,
                               const half* __restrict__ B,
                               const half2* __restrict__ Q,
                               int M,
                               int N,
                               int K,
                               int output_op_idx)
{
    Gemm gemm{};
    gemm.run_v2(C, A, B, Q, M, N, K, output_op_idx);
}

}  // namespace turbomind