/******************************************************************************
 * Copyright (c) 2023, Tri Dao.
 ******************************************************************************/

#pragma once

#include "debug.h"

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include <cuda_fp16.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#include <cuda_bf16.h>
#endif

#include <cute/algorithm/copy.hpp>
#include <cute/algorithm/gemm.hpp>

#include <cutlass/array.h>
#include <cutlass/cutlass.h>
#include <cutlass/numeric_conversion.h>
#include <cutlass/numeric_types.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace flash {

////////////////////////////////////////////////////////////////////////////////////////////////////

// ReLU applied to two 16-bit floating-point values packed into one 32-bit word.
// T selects the packed element type; only explicit specializations are defined.
template<typename T>
__forceinline__ __device__ uint32_t relu2(const uint32_t x);

// ReLU on a pair of fp16 values packed in the 32-bit word x; returns the packed result.
template<>
__forceinline__ __device__ uint32_t relu2<cutlass::half_t>(const uint32_t x) {
    uint32_t res;
    const uint32_t zero = 0u;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // SM80 and above: one packed max against zero.
    asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
#else
    // Older architectures: build a per-half mask from a "greater than or unordered"
    // comparison of x against zero, then AND the mask with x.
    asm volatile(
        "{\n"
        "\t .reg .f16x2 sela;\n"
        "\t set.gtu.u32.f16x2 sela, %1, %2;\n"
        "\t and.b32 %0, sela, %1;\n"
        "}\n" : "=r"(res) : "r"(x), "r"(zero));
#endif
    return res;
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
// ReLU on a pair of bf16 values packed in the 32-bit word x, done as one packed max against zero.
// Only compiled for SM80 and above (see the enclosing #if).
template<>
__forceinline__ __device__ uint32_t relu2<cutlass::bfloat16_t>(const uint32_t x) {
    uint32_t res;
    const uint32_t zero = 0u;
    asm volatile("max.bf16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
    return res;
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

// Fused fp32 -> 16-bit conversion and ReLU for a pair of floats; the two results are
// returned packed in one 32-bit word. T selects the 16-bit destination type; only
// explicit specializations are defined.
template<typename T>
__forceinline__ __device__ uint32_t convert_relu2(const float2 x);

// Converts x.x and x.y to fp16 with round-to-nearest and ReLU in one instruction.
// x.y is passed as the first source operand and x.x as the second; per the PTX cvt
// packing rules that places x.y in the upper half and x.x in the lower half of the result.
template<>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::half_t>(const float2 x) {
    uint32_t res;
    // Bit-exact float -> uint32 reinterpretation without aliasing through a reference cast.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    asm volatile("cvt.rn.relu.f16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

// Converts x.x and x.y to bf16 with round-to-nearest and ReLU in one instruction.
// x.y is passed as the first source operand and x.x as the second; per the PTX cvt
// packing rules that places x.y in the upper half and x.x in the lower half of the result.
template<>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::bfloat16_t>(const float2 x) {
    uint32_t res;
    // Bit-exact float -> uint32 reinterpretation without aliasing through a reference cast.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    asm volatile("cvt.rn.relu.bf16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary functor returning the larger of its two arguments (y when they compare equal).
template<typename T>
struct MaxOp {
    __device__ __forceinline__ T operator()(T const & x, T const & y) const { return x > y ? x : y; }
};

// float specialization of MaxOp that calls max() instead of using a comparison.
template <>
struct MaxOp<float> {
    // This is slightly faster
    __device__ __forceinline__ float operator()(float const &x, float const &y) const { return max(x, y); }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary functor returning the sum of its two arguments.
template<typename T>
struct SumOp {
    __device__ __forceinline__ T operator()(T const & x, T const & y) const { return x + y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Butterfly all-reduce over groups of THREADS lanes within a warp.
// Each step combines a lane's value with the value held by the lane whose index differs
// in one bit (XOR shuffle), halving the offset until the Allreduce<2> specialization
// finishes with offset 1. Every lane ends up with the reduced value of its group.
// The shuffle uses a full-warp mask, so all 32 lanes must reach the call.
template<int THREADS>
struct Allreduce {
    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
    // x:  this lane's contribution.
    // op: binary combiner, invoked as op(own_value, partner_value).
    template<typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        constexpr int OFFSET = THREADS / 2;
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
        return Allreduce<OFFSET>::run(x, op);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Base case of the butterfly all-reduce: combines each lane with its XOR-1 neighbour.
// The shuffle uses a full-warp mask, so all 32 lanes must reach the call.
template<>
struct Allreduce<2> {
    template<typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
        return x;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Accumulates A * B into acc, one K-slice at a time, overlapping operand loads with MMAs.
//
// tCrA / tCrB are the register fragments consumed by the MMA; tCsA / tCsB are the sources
// they are filled from via smem_tiled_copy_A / smem_tiled_copy_B (presumably shared memory,
// going by the parameter names). Slice 0 is loaded up front; inside the loop, slice i + 1
// is loaded before the MMA on slice i is issued.
//
// A_in_regs / B_in_regs: when true, the corresponding fragment is assumed to be already
// populated and no copy is issued for that operand.
template<bool A_in_regs=false, bool B_in_regs=false, typename Tensor0, typename Tensor1,
         typename Tensor2, typename Tensor3, typename Tensor4,
         typename TiledMma, typename TiledCopyA, typename TiledCopyB,
         typename ThrCopyA, typename ThrCopyB>
__forceinline__ __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsA,
                            Tensor4 const& tCsB, TiledMma tiled_mma,
                            TiledCopyA smem_tiled_copy_A, TiledCopyB smem_tiled_copy_B,
                            ThrCopyA smem_thr_copy_A, ThrCopyB smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
    // Re-tile the register fragments so they can be used as copy destinations.
    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
    // Prologue: fetch the first K-slice of each operand that is not already in registers.
    if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
    if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }
    #pragma unroll
    for (int i = 0; i < size<2>(tCrA); ++i) {
        // Fetch the next K-slice (if any) before computing on the current one.
        if (i < size<2>(tCrA) - 1) {
            if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, i + 1), tCrA_copy_view(_, _, i + 1)); }
            if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1)); }
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Accumulates A * B into acc where A is already held in the register fragment tCrA and
// only B is loaded (from tCsB into tCrB via smem_tiled_copy_B; presumably from shared
// memory, going by the parameter names). B's slice 0 is loaded up front; inside the loop,
// slice i + 1 is loaded before the MMA on slice i is issued.
template<typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3,
         typename TiledMma, typename TiledCopy, typename ThrCopy>
__forceinline__ __device__ void gemm_rs(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsB,
                               TiledMma tiled_mma, TiledCopy smem_tiled_copy_B,
                               ThrCopy smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
    // Re-tile the B register fragment so it can be used as a copy destination.
    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
    // Prologue: fetch the first K-slice of B.
    cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
    #pragma unroll
    for (int i = 0; i < size<2>(tCrA); ++i) {
        // Fetch the next K-slice of B (if any) before computing on the current one.
        if (i < size<2>(tCrA) - 1) {
            cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)).
// The 4-element MMA mode is split into (2, 2); its second sub-mode is grouped with MMA_M
// to form the row mode and its first sub-mode with MMA_N to form the column mode.
template<typename Layout>
__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) {
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    auto l = logical_divide(acc_layout, Shape<_2>{});  // ((2, 2), MMA_M, MMA_N)
    return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l)));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8.
// The K extent is read from MMA_traits::Shape_MNK; only 8 and 16 are accepted.
template<typename MMA_traits, typename Layout>
__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{});
    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
    if constexpr (mma_shape_K == 8) {
        // K == 8: the layout is returned unchanged.
        return acc_layout;
    } else {
        // K == 16: pair up adjacent MMA_N tiles and fold the pair index into the first mode.
        auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2))
        return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2).
// Adjacent MMA_N tiles are paired up and the pair index is folded into the first mode.
template<typename Layout>
__forceinline__ __device__ auto convert_layout_acc_dropout(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2))
    return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Converts every element of `tensor` to To_type and returns a tensor with the same layout.
// The element count must be known at compile time (it is read from the static size).
//
// NOTE(review): the returned tensor is a view over `frag`, a local of this function.
// This appears to rely on the function being inlined into the caller so that the
// converted values stay live in the caller's registers - confirm before removing
// __forceinline__ or calling this through a non-inlined path.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    constexpr int numel = decltype(size(tensor))::value;
    cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
    // HACK: this requires tensor to be "contiguous"
    // (the storage is reinterpreted as a flat cutlass::Array of numel elements).
    auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
    return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// In-place ReLU over a tensor of 16-bit elements, processed two elements at a time.
// The tensor is viewed as 32-bit words and each word goes through relu2<value_t>, so the
// element count must be even and the element type must have a relu2 specialization.
template <typename Engine, typename Layout>
__forceinline__ __device__ void relu_(Tensor<Engine, Layout> &tensor) {
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
    using value_t = typename Engine::value_type;
    // HACK: this requires tensor to be "contiguous"
    Tensor tensor_uint32 = recast<uint32_t>(tensor);
    #pragma unroll
    for (int i = 0; i < size(tensor_uint32); ++i) {
        tensor_uint32(i) = relu2<value_t>(tensor_uint32(i));
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Converts an fp32 tensor to To_type (cutlass::half_t or cutlass::bfloat16_t) and applies ReLU.
// On SM80 and above, we can fuse fp32 -> fp16/bf16 conversion and relu into 1 instruction;
// on older architectures the tensor is converted first and ReLU is applied afterwards.
// The element count must be even and known at compile time.
//
// NOTE(review): on the SM80+ path the returned tensor is a view over `out_uint32`, a local
// of this function. This appears to rely on the function being inlined into the caller -
// confirm before removing __forceinline__.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type_relu(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    static_assert(std::is_same_v<To_type, cutlass::half_t> || std::is_same_v<To_type, cutlass::bfloat16_t>);
    static_assert(std::is_same_v<float, From_type>);
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // HACK: this requires tensor to be "contiguous"
    // View the input as float pairs; each pair becomes one packed 32-bit output word.
    Tensor tensor_float2 = recast<float2>(tensor);
    Tensor out_uint32 = make_tensor<uint32_t>(tensor_float2.layout());
    #pragma unroll
    for (int i = 0; i < size(out_uint32); ++i) {
        out_uint32(i) = convert_relu2<To_type>(tensor_float2(i));
    }
    // Reinterpret the packed words as To_type elements with the input's layout.
    Tensor out = make_tensor(make_rmem_ptr<To_type>(out_uint32.data()), tensor.layout());
#else
    Tensor out = flash::convert_type<To_type>(tensor);
    flash::relu_(out);
#endif
    return out;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Blocks until at most N of the most recently committed cp.async groups are still pending,
// i.e. until all earlier committed groups have completed.
// This differs from cute::cp_async_wait in that when N = 0 we don't call cp.async.wait_all
// (which is equivalent to commit_group then wait_group 0).
// Instead we just call cp.async.wait_group 0, which is slightly faster.
// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
// Expands to an empty function when CUTE_ARCH_CP_ASYNC_SM80_ENABLED is not defined.
template <int N>
CUTE_HOST_DEVICE
void cp_async_wait() {
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
    asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Resolves the initial base offset of this thread's slice of a paged KV copy from gmem.
// Assumes that the tensor has already been positioned at the correct head.
//
// The thread's row is placed in the last KV block (index n_block_max - 1), split into a
// page index and a row within that page, and the page index is translated through
// block_table.
//
//   tidx            thread index used to derive the thread's row and column
//   n_block_max     number of KV blocks; presumably >= 1, otherwise the row is negative
//                   and block_table is indexed out of range - TODO confirm with callers
//   page_block_size rows per page (must be non-zero)
//   block_table     maps a virtual page index to a physical page index
//   page_stride     offset between consecutive physical pages
//   row_stride      offset between consecutive rows inside a page
//
// Returns the offset as a 64-bit value: physical_page * page_stride can exceed the range
// of a 32-bit int for large KV caches, so the arithmetic is carried out in int64_t.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t init_thread_kv_page_slice_offset(const int tidx, const int n_block_max, const int page_block_size,
                            const int* block_table, const int page_stride, const int row_stride) {
    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
    constexpr int kGmemElemsPerLoad = Kernel_traits::kGmemElemsPerLoad;
    constexpr int kBlockN = Kernel_traits::kBlockN;

    // Position of this thread inside one kBlockN-row block.
    const int col_offset = tidx % kGmemThreadsPerRow * kGmemElemsPerLoad;
    const int block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;
    // Row index counted from the start of the whole (virtual) KV sequence.
    const int global_row_offset = block_row_offset + (n_block_max - 1) * kBlockN;
    const int page_offset = global_row_offset % page_block_size;
    const int virtual_page_idx = global_row_offset / page_block_size;

    return int64_t(block_table[virtual_page_idx]) * int64_t(page_stride)
        + int64_t(page_offset) * int64_t(row_stride)
        + int64_t(col_offset);
}
 
////////////////////////////////////////////////////////////////////////////////////////////////////

// Computes how far this thread's base address must move to step its slice of a paged
// copy from KV block n_block to KV block n_block - 1.
//
// The thread's row is located in both blocks, each row is split into a page index and a
// row within the page, and both page indices are translated through block_table. The
// result is the difference between the two resolved positions.
//
//   tidx            thread index used to derive the thread's row
//   n_block         index of the current KV block; presumably >= 1, otherwise the next
//                   row is negative and block_table is indexed out of range - TODO confirm
//   page_block_size rows per page (must be non-zero)
//   block_table     maps a virtual page index to a physical page index
//   page_stride     offset between consecutive physical pages
//   row_stride      offset between consecutive rows inside a page
//
// Returns a signed 64-bit delta: (page difference) * page_stride can exceed the range of
// a 32-bit int for large KV caches, so the arithmetic is carried out in int64_t.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t advance_thread_kv_page_slice_offset(const int tidx, const int n_block, const int page_block_size,
                            const int* block_table, const int page_stride, const int row_stride) {
    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
    constexpr int kBlockN = Kernel_traits::kBlockN;

    const int block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;

    // Row of this thread in the current block and in the block it is moving to.
    const int global_row_offset_cur = block_row_offset + n_block * kBlockN;
    const int global_row_offset_next = block_row_offset + (n_block - 1) * kBlockN;

    const int page_offset_cur = global_row_offset_cur % page_block_size;
    const int page_offset_next = global_row_offset_next % page_block_size;

    const int virtual_page_idx_cur = global_row_offset_cur / page_block_size;
    const int virtual_page_idx_next = global_row_offset_next / page_block_size;

    const int64_t table_diff = int64_t(block_table[virtual_page_idx_next]) - int64_t(block_table[virtual_page_idx_cur]);
    const int64_t offset_diff = int64_t(page_offset_next) - int64_t(page_offset_cur);

    return table_diff * int64_t(page_stride) + offset_diff * int64_t(row_stride);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Predicated tile copy from S to D, one (m, k) slice at a time.
//
//   Is_even_MN    when true, no bound check is made on the M/N index of a row.
//   Is_even_K     when true, no predicate check is made on K.
//   Clear_OOB_MN  when true, rows rejected by the M/N check are zeroed in D.
//   Clear_OOB_K   when true, K slices rejected by predicate_K are zeroed in D.
//   identity_MN   coordinate tensor; get<0>(identity_MN(0, m, 0)) is compared to max_MN.
//   predicate_K   predicate_K(k) says whether K slice k is to be copied.
//   max_MN        exclusive upper bound on the M/N index (used only when !Is_even_MN).
//
// Rows or slices that are rejected and not cleared are left untouched in D.
template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
          typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layout0> const &S,
                            Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                            Tensor<Engine3, Layout3> const &predicate_K, const int max_MN=0) {
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    // There's no case where !Clear_OOB_K && Clear_OOB_MN
    static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(tiled_copy, S(_, m, k), D(_, m, k));
                } else if (Clear_OOB_K) {
                    cute::clear(D(_, m, k));
                }
            }
        } else if (Clear_OOB_MN) {
            cute::clear(D(_, m, _));
        }
    }
    // TD [2023-04-13]: Strange that the code below can cause race condition.
    // I think it's because the copies are under an if statement.
    // if (Is_even_K) {
    //     #pragma unroll
    //     for (int m = 0; m < size<1>(S); ++m) {
    //         if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //             copy(tiled_copy, S(_, m, _), D(_, m, _));
    //         } else if (Clear_OOB_MN) {
    //             clear(D(_, m, _));
    //         }
    //     }
    // } else {  // It's slightly faster in this case if iterate over K first
    //     #pragma unroll
    //     for (int k = 0; k < size<2>(S); ++k) {
    //         if (predicate_K(k)) {
    //             #pragma unroll
    //             for (int m = 0; m < size<1>(S); ++m) {
    //                 if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //                     copy(tiled_copy, S(_, m, k), D(_, m, k));
    //                 } else if (Clear_OOB_MN) {
    //                     clear(D(_, m, k));
    //                 }
    //             }
    //         } else if (Clear_OOB_K) {  // There's no case where !Clear_OOB_K && Clear_OOB_MN
    //             if (Clear_OOB_MN || Is_even_MN) {
    //                 clear(D(_, _, k));
    //             } else {
    //                 #pragma unroll
    //                 for (int m = 0; m < size<1>(S); ++m) {
    //                     if (!(Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN)) {
    //                         clear(D(_, m, k));
    //                     }
    //                 }
    //             }
    //         }
    //     }
    // }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Copies from S to D only the rows whose M/N index lies in [min_MN, max_MN).
//
//   Is_even_K    when true, no predicate check is made on K.
//   identity_MN  coordinate tensor; get<0>(identity_MN(0, m, 0)) is the index that is tested.
//   predicate_K  predicate_K(k) says whether K slice k is to be copied.
//   max_MN       exclusive upper bound on the row index.
//   min_MN       inclusive lower bound on the row index.
//
// Rows outside the range and K slices rejected by predicate_K are left untouched in D
// (nothing is cleared). With the default bounds (0, 0) no row is copied.
template <bool Is_even_K=true,
          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy_w_min_idx(Tensor<Engine0, Layout0> const &S,
                                      Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                                      Tensor<Engine3, Layout3> const &predicate_K,
                                      const int max_MN=0, const int min_MN=0) {
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, max_MN = %d, min_MN = %d\n", blockIdx.y, max_MN, min_MN); }
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        // Look the row index up once and reuse it for both bound checks.
        const auto row = get<0>(identity_MN(0, m, 0));
        // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
        if (row >= min_MN && row < max_MN) {
            // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("Inner loop, blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(S(_, m, k), D(_, m, k));
                }
            }
        }
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace flash