// utils.h
/******************************************************************************
 * Copyright (c) 2023, Tri Dao.
 ******************************************************************************/

#pragma once

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include <cuda_fp16.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#include <cuda_bf16.h>
#endif

#include <cute/algorithm/copy.hpp>
#include <cute/algorithm/gemm.hpp>

#include <cutlass/array.h>
#include <cutlass/cutlass.h>
#include <cutlass/numeric_conversion.h>
#include <cutlass/numeric_types.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace flash {

////////////////////////////////////////////////////////////////////////////////////////////////////

// ReLU on a 32-bit word whose bits are handed to a packed two-element (x2) PTX instruction.
// T selects the element format; only the explicit specializations are defined.
template <typename T>
__forceinline__ __device__ uint32_t relu2(const uint32_t x);

// relu2 for packed fp16 (f16x2): returns the ReLU of x, computed against an all-zero word.
template <>
__forceinline__ __device__ uint32_t relu2<cutlass::half_t>(const uint32_t x) {
    const uint32_t packed_zero = 0u;
    uint32_t clamped;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // Single packed max against zero when compiling for SM80 or newer.
    asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(clamped) : "r"(x), "r"(packed_zero));
#else
    // Fallback: compare x against zero (greater-than-or-unordered) into a mask register,
    // then AND that mask with x.
    asm volatile(
        "{\n"
        "\t .reg .f16x2 sela;\n"
        "\t set.gtu.u32.f16x2 sela, %1, %2;\n"
        "\t and.b32 %0, sela, %1;\n"
        "}\n"
        : "=r"(clamped) : "r"(x), "r"(packed_zero));
#endif
    return clamped;
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
// relu2 for packed bf16 (bf16x2): one packed max against an all-zero word.
// Only compiled for SM80 or newer (see the enclosing guard).
template <>
__forceinline__ __device__ uint32_t relu2<cutlass::bfloat16_t>(const uint32_t x) {
    const uint32_t packed_zero = 0u;
    uint32_t clamped;
    asm volatile("max.bf16x2 %0, %1, %2;\n" : "=r"(clamped) : "r"(x), "r"(packed_zero));
    return clamped;
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

// Converts the two floats of x to the 16-bit format selected by T, applies ReLU, and returns
// both results packed into one 32-bit word. Only the explicit specializations are defined.
template <typename T>
__forceinline__ __device__ uint32_t convert_relu2(const float2 x);

// fp32 pair -> packed fp16 pair with fused ReLU (round-to-nearest), in one PTX cvt instruction.
// x.y is passed as the first source operand and x.x as the second; per the PTX cvt definition
// the first source is stored in the upper half of the result and the second in the lower half.
template <>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::half_t>(const float2 x) {
    uint32_t res;
    // Bit-cast with the CUDA intrinsic instead of reading the float through a uint32_t
    // reference, which violates the strict-aliasing rules.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    asm volatile("cvt.rn.relu.f16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

// fp32 pair -> packed bf16 pair with fused ReLU (round-to-nearest), in one PTX cvt instruction.
// x.y is passed as the first source operand and x.x as the second; per the PTX cvt definition
// the first source is stored in the upper half of the result and the second in the lower half.
template <>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::bfloat16_t>(const float2 x) {
    uint32_t res;
    // Bit-cast with the CUDA intrinsic instead of reading the float through a uint32_t
    // reference, which violates the strict-aliasing rules.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    asm volatile("cvt.rn.relu.bf16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary functor that returns x when x > y, and y otherwise.
template <typename T>
struct MaxOp {
    __device__ __forceinline__ T operator()(T const &x, T const &y) {
        if (x > y) { return x; }
        return y;
    }
};

// float specialization: defers to max(), which the original author measured as slightly faster.
template <>
struct MaxOp<float> {
    __device__ __forceinline__ float operator()(float const &x, float const &y) {
        return max(x, y);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary functor that returns x + y.
template <typename T>
struct SumOp {
    __device__ __forceinline__ T operator()(T const &x, T const &y) {
        return x + y;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Butterfly all-reduce across groups of THREADS lanes using XOR shuffles.
// Each step combines a lane's value with the value held by the lane THREADS / 2 away, then
// recurses on the halved group size; the Allreduce<2> specialization ends the recursion.
// The shuffle uses a full-warp mask, so all 32 lanes of the warp must take part in the call.
template <int THREADS>
struct Allreduce {
    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);

    // x:  this lane's contribution.
    // op: binary combiner, called as op(own_value, partner_value).
    // Returns the combined value.
    template <typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        constexpr int kHalf = THREADS / 2;
        const T partner = __shfl_xor_sync(0xffffffffu, x, kHalf);
        return Allreduce<kHalf>::run(op(x, partner), op);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Final step of the butterfly all-reduce: combine with the neighbouring lane (lane XOR 1).
// The shuffle uses a full-warp mask, so all 32 lanes of the warp must take part in the call.
template <>
struct Allreduce<2> {
    template <typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        const T partner = __shfl_xor_sync(0xffffffffu, x, 1);
        return op(x, partner);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Accumulates into acc the MMA of tCrA and tCrB, one slice of mode 2 (MMA_K) at a time.
// Unless A_in_regs / B_in_regs is set, each operand slice is first copied from tCsA / tCsB
// into the operand tensor (through its retiled copy view) with the matching tiled copy.
// The copy for slice i + 1 is issued before the MMA on slice i.
// (Parameter names suggest tCsA / tCsB live in shared memory; this function does not check that.)
template <bool A_in_regs = false, bool B_in_regs = false, typename Tensor0, typename Tensor1,
          typename Tensor2, typename Tensor3, typename Tensor4,
          typename TiledMma, typename TiledCopyA, typename TiledCopyB,
          typename ThrCopyA, typename ThrCopyB>
__forceinline__ __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsA,
                            Tensor4 const& tCsB, TiledMma tiled_mma,
                            TiledCopyA smem_tiled_copy_A, TiledCopyB smem_tiled_copy_B,
                            ThrCopyA smem_thr_copy_A, ThrCopyB smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K

    // Views of the operand tensors laid out the way the copy atoms write them.
    auto tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
    auto tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N

    // Load the first K slice before entering the loop.
    if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
    if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }

    constexpr int kNumSlicesK = decltype(size<2>(tCrA))::value;
    #pragma unroll
    for (int i = 0; i < kNumSlicesK; ++i) {
        const int next = i + 1;
        if (next < kNumSlicesK) {
            // Issue the loads for the next slice ahead of this slice's MMA.
            if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, next), tCrA_copy_view(_, _, next)); }
            if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, next), tCrB_copy_view(_, _, next)); }
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Same as gemm, except that only the B operand is copied: tCrA is used as-is, and each slice
// of tCsB along mode 2 (MMA_K) is copied into tCrB (through its retiled copy view) before use.
// The copy for slice i + 1 is issued before the MMA on slice i.
template <typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3,
          typename TiledMma, typename TiledCopy, typename ThrCopy>
__forceinline__ __device__ void gemm_rs(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsB,
                               TiledMma tiled_mma, TiledCopy smem_tiled_copy_B,
                               ThrCopy smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K

    // View of tCrB laid out the way the copy atom writes it.
    auto tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N

    // Load the first K slice of B before entering the loop.
    cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));

    constexpr int kNumSlicesK = decltype(size<2>(tCrA))::value;
    #pragma unroll
    for (int i = 0; i < kNumSlicesK; ++i) {
        const int next = i + 1;
        if (next < kNumSlicesK) {
            // Issue the load for the next slice ahead of this slice's MMA.
            cute::copy(smem_tiled_copy_B, tCsB(_, _, next), tCrB_copy_view(_, _, next));
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Re-groups an accumulator layout of shape (MMA=4, MMA_M, MMA_N) into a two-mode
// (nrow=(2, MMA_M), ncol=(2, MMA_N)) layout. Only the grouping of modes changes.
template <typename Layout>
__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) {
    static_assert(decltype(rank(acc_layout))::value == 3);
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    // Split the leading mode of size 4 into (2, 2): ((2, 2), MMA_M, MMA_N)
    auto split = logical_divide(acc_layout, Shape<_2>{});
    auto row_mode = make_layout(get<0, 1>(split), get<1>(split));
    auto col_mode = make_layout(get<0, 0>(split), get<2>(split));
    return make_layout(row_mode, col_mode);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8.
// The K extent is read from MMA_traits::Shape_MNK; only 8 and 16 are accepted.
template <typename MMA_traits, typename Layout>
__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(rank(acc_layout))::value == 3);
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{});
    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
    if constexpr (mma_shape_K == 16) {
        // Split MMA_N by 2 and fold the factor of 2 into the leading mode.
        auto split = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
        auto lead_mode = make_layout(get<0>(split), get<2, 0>(split));
        return make_layout(lead_mode, get<1>(split), get<2, 1>(split));
    } else {
        // K == 8: the accumulator layout is returned unchanged.
        return acc_layout;
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
template <typename Layout>
__forceinline__ __device__ auto convert_layout_acc_dropout(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(rank(acc_layout))::value == 3);
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    // Split MMA_N by 2 and fold the factor of 2 into the leading mode.
    auto split = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
    auto lead_mode = make_layout(get<0>(split), get<2, 0>(split));
    return make_layout(lead_mode, get<1>(split), get<2, 1>(split));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Converts every element of `tensor` to To_type and returns the result as a new tensor with
// the same layout.
//
// The result is an owning tensor. Returning a view over a function-local fragment would
// leave the caller with storage that is only valid if the call is inlined.
//
// Requirements:
//   - the size and layout of `tensor` must be static;
//   - HACK: `tensor` must be "contiguous", because its storage is reinterpreted as a flat
//     cutlass::Array of size(tensor) elements.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    constexpr int numel = decltype(size(tensor))::value;
    // The flat store below writes numel elements; make sure the result has room for them.
    static_assert(decltype(cosize(tensor.layout()))::value >= numel,
                  "convert_type expects a layout whose cosize covers all of its elements");
    cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
    // Owning storage for the converted values, laid out like the input.
    Tensor out = make_tensor<To_type>(tensor.layout());
    // HACK: this requires tensor to be "contiguous"
    *reinterpret_cast<cutlass::Array<To_type, numel> *>(out.data()) =
        convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
    return out;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// In-place ReLU over `tensor`, processed two elements at a time through relu2.
// The tensor is recast to 32-bit words, so its element count must be even and a relu2
// specialization must exist for its value type.
template <typename Engine, typename Layout>
__forceinline__ __device__ void relu_(Tensor<Engine, Layout> &tensor) {
    using value_t = typename Engine::value_type;
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
    // HACK: this requires tensor to be "contiguous"
    auto packed = recast<uint32_t>(tensor);
    #pragma unroll
    for (int idx = 0; idx < size(packed); ++idx) {
        const uint32_t word = packed(idx);
        packed(idx) = relu2<value_t>(word);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Converts a float tensor to To_type (cutlass::half_t or cutlass::bfloat16_t) and applies ReLU.
// On SM80 and above, we can fuse fp32 -> fp16/bf16 conversion and relu into 1 instruction;
// otherwise the conversion and the ReLU run as two separate passes.
//
// The result has the same layout as `tensor`. In the SM80+ path it is an owning tensor:
// returning a view over a function-local buffer would leave the caller with storage that is
// only valid if the call is inlined.
//
// Requirements: static size and layout, an even number of elements, and (HACK) "contiguous"
// storage, since elements are processed in pairs through recast views.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type_relu(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    static_assert(std::is_same_v<To_type, cutlass::half_t> || std::is_same_v<To_type, cutlass::bfloat16_t>);
    static_assert(std::is_same_v<float, From_type>);
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // HACK: this requires tensor to be "contiguous"
    Tensor tensor_float2 = recast<float2>(tensor);
    // Owning result; the packed 32-bit view below aliases its storage.
    Tensor out = make_tensor<To_type>(tensor.layout());
    Tensor out_uint32 = recast<uint32_t>(out);
    #pragma unroll
    for (int i = 0; i < size(out_uint32); ++i) {
        out_uint32(i) = convert_relu2<To_type>(tensor_float2(i));
    }
#else
    Tensor out = flash::convert_type<To_type>(tensor);
    flash::relu_(out);
#endif
    return out;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Blocks until all but N previous cp.async.commit_group operations have committed.
// This differs from cute::cp_async_wait in that when N = 0 we don't call cp.async.wait_all
// (which is equivalent to commit_group then wait_group 0).
// Instead we just call cp.async.wait_group 0, which is slightly faster.
// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
// Compiles to nothing when CUTE_ARCH_CP_ASYNC_SM80_ENABLED is not defined.
template <int N>
CUTE_HOST_DEVICE
void cp_async_wait() {
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
    asm volatile("cp.async.wait_group %0;\n" : : "n"(N));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Resolves the initial base offset of this thread's slice of a paged KV copy from gmem.
// Assumes that the tensor has already been positioned at the correct head.
//
// The thread's row is taken in KV block (n_block_max - 1). That logical row is split into a
// virtual page index and a row within the page, and the virtual page is translated to a
// physical page through block_table.
//
//   tidx            thread index used to derive the thread's row and column
//   n_block_max     one past the KV block to start from
//   page_block_size number of rows per page
//   block_table     maps virtual page index -> physical page index
//   page_stride     offset between consecutive physical pages
//   row_stride      offset between consecutive rows inside a page
//
// Returns physical_page * page_stride + row_in_page * row_stride + column offset.
// The result is computed and returned in 64 bits: the page term can exceed the int range
// for large caches, and a 32-bit product would overflow.
// NOTE(review): n_block_max is presumably >= 1; with 0 the row offset goes negative — confirm
// against callers.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t init_thread_kv_page_slice_offset(const int tidx, const int n_block_max, const int page_block_size,
                            const int* block_table, const int page_stride, const int row_stride) {
    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
    constexpr int kGmemElemsPerLoad = Kernel_traits::kGmemElemsPerLoad;
    constexpr int kBlockN = Kernel_traits::kBlockN;

    const int col_offset = tidx % kGmemThreadsPerRow * kGmemElemsPerLoad;
    const int block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;
    const int global_row_offset = block_row_offset + (n_block_max - 1) * kBlockN;
    const int page_offset = global_row_offset % page_block_size;
    const int virtual_page_idx = global_row_offset / page_block_size;

    // Widen before multiplying so that the products are evaluated in 64 bits.
    return static_cast<int64_t>(block_table[virtual_page_idx]) * page_stride
        + static_cast<int64_t>(page_offset) * row_stride
        + col_offset;
}
 
////////////////////////////////////////////////////////////////////////////////////////////////////

// Advances the base address of this thread's slice of a paged copy from gmem.
//
// Returns the difference between the thread's offset in KV block (n_block - 1) and its
// offset in KV block n_block, i.e. the amount to add to a pointer positioned for block
// n_block in order to reach block n_block - 1. Both rows are translated through block_table.
//
//   tidx            thread index used to derive the thread's row
//   n_block         KV block the pointer currently refers to
//   page_block_size number of rows per page
//   block_table     maps virtual page index -> physical page index
//   page_stride     offset between consecutive physical pages
//   row_stride      offset between consecutive rows inside a page
//
// The result is computed and returned in 64 bits: the page-index difference times
// page_stride can exceed the int range for large caches.
// NOTE(review): n_block is presumably >= 1; with 0 the "next" row offset goes negative —
// confirm against callers.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t advance_thread_kv_page_slice_offset(const int tidx, const int n_block, const int page_block_size,
                            const int* block_table, const int page_stride, const int row_stride) {
    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
    constexpr int kBlockN = Kernel_traits::kBlockN;

    const int block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;

    const int global_row_offset_cur = block_row_offset + n_block * kBlockN;
    const int global_row_offset_next = block_row_offset + (n_block - 1) * kBlockN;

    const int page_offset_cur = global_row_offset_cur % page_block_size;
    const int page_offset_next = global_row_offset_next % page_block_size;

    const int virtual_page_idx_cur = global_row_offset_cur / page_block_size;
    const int virtual_page_idx_next = global_row_offset_next / page_block_size;

    // Widen before multiplying so that the products are evaluated in 64 bits.
    const int64_t table_diff = static_cast<int64_t>(block_table[virtual_page_idx_next])
        - static_cast<int64_t>(block_table[virtual_page_idx_cur]);
    const int64_t offset_diff = static_cast<int64_t>(page_offset_next) - page_offset_cur;

    return table_diff * page_stride + offset_diff * row_stride;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Somewhat unorthodox reshape function. Given a tuple ((v1, v2), m, k), returns (v1, v2, k),
// where v2 may be a tuple itself, in the case of swizzled smem-backed thread tiles. This ensures
// that paged and non-paged copies result in equivalently shaped, if not necessarily strided, tensors.
// Mode 1 (m) of the input layout is dropped; shape and stride are rebuilt the same way.
template <class Shape, class Stride>
__forceinline__ __device__
auto reshape_thread_tile(Layout<Shape, Stride> l) {
    auto tile_shape = append(get<0>(l.shape()), get<2>(l.shape()));
    auto tile_stride = append(get<0>(l.stride()), get<2>(l.stride()));
    return make_layout(tile_shape, tile_stride);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Predicated tiled copy of S into D, both rank-3 tensors with matching mode sizes.
//
// For each index m of mode 1:
//   - the row is "in bounds" when Is_even_MN is set or get<0>(identity_MN(0, m, 0)) < max_MN;
//   - an in-bounds row is copied for every index k of mode 2 where Is_even_K is set or
//     predicate_K(k) holds; other k are cleared in D when Clear_OOB_K is set;
//   - an out-of-bounds row is cleared in D when Clear_OOB_MN is set, otherwise left untouched.
//
// The control flow is intentionally kept in this shape; see the note after the loop.
template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
          typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layout0> const &S,
                            Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                            Tensor<Engine3, Layout3> const &predicate_K, const int max_MN=0) {
    // There's no case where !Clear_OOB_K && Clear_OOB_MN
    static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(tiled_copy, S(_, m, k), D(_, m, k));
                } else if (Clear_OOB_K) {
                    cute::clear(D(_, m, k));
                }
            }
        } else if (Clear_OOB_MN) {
            cute::clear(D(_, m, _));
        }
    }
    // TD [2023-04-13]: Strange that the code below can cause race condition.
    // I think it's because the copies are under an if statement.
    // if (Is_even_K) {
    //     #pragma unroll
    //     for (int m = 0; m < size<1>(S); ++m) {
    //         if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //             copy(tiled_copy, S(_, m, _), D(_, m, _));
    //         } else if (Clear_OOB_MN) {
    //             clear(D(_, m, _));
    //         }
    //     }
    // } else {  // It's slightly faster in this case if iterate over K first
    //     #pragma unroll
    //     for (int k = 0; k < size<2>(S); ++k) {
    //         if (predicate_K(k)) {
    //             #pragma unroll
    //             for (int m = 0; m < size<1>(S); ++m) {
    //                 if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //                     copy(tiled_copy, S(_, m, k), D(_, m, k));
    //                 } else if (Clear_OOB_MN) {
    //                     clear(D(_, m, k));
    //                 }
    //             }
    //         } else if (Clear_OOB_K) {  // There's no case where !Clear_OOB_K && Clear_OOB_MN
    //             if (Clear_OOB_MN || Is_even_MN) {
    //                 clear(D(_, _, k));
    //             } else {
    //                 #pragma unroll
    //                 for (int m = 0; m < size<1>(S); ++m) {
    //                     if (!(Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN)) {
    //                         clear(D(_, m, k));
    //                     }
    //                 }
    //             }
    //         }
    //     }
    // }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Copies S into D (rank-3 tensors with matching mode sizes) for the rows whose coordinate
// get<0>(identity_MN(0, m, 0)) lies in [min_MN, max_MN). Within such a row, index k of mode 2
// is copied when Is_even_K is set or predicate_K(k) holds. Nothing in D is cleared: elements
// that are not copied are left untouched.
template <bool Is_even_K=true,
          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy_w_min_idx(Tensor<Engine0, Layout0> const &S,
                                      Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                                      Tensor<Engine3, Layout3> const &predicate_K,
                                      const int max_MN=0, const int min_MN=0) {
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        // Row coordinate of this slice, looked up once.
        const auto row_coord = get<0>(identity_MN(0, m, 0));
        if (row_coord >= min_MN && row_coord < max_MN) {
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(S(_, m, k), D(_, m, k));
                }
            }
        }
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace flash