/******************************************************************************
 * Copyright (c) 2023, Tri Dao.
 ******************************************************************************/

#pragma once

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include <cuda_fp16.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#include <cuda_bf16.h>
#endif

#include <cute/tensor.hpp>

#include <cutlass/array.h>
#include <cutlass/cutlass.h>
#include <cutlass/numeric_conversion.h>
#include <cutlass/numeric_types.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace flash {

////////////////////////////////////////////////////////////////////////////////////////////////////

// Applies ReLU to two 16-bit floating-point values packed into one 32-bit word.
// T selects the packed element type and must be given explicitly, because it cannot be
// deduced from the uint32_t argument. Only explicit specializations are defined; there is
// no generic definition.
template<typename T>
__forceinline__ __device__ uint32_t relu2(const uint32_t x);

// ReLU on a packed pair of fp16 values (f16x2) held in one 32-bit register.
// Returns the packed pair with the same lane order as the input.
template<>
__forceinline__ __device__ uint32_t relu2<cutlass::half_t>(const uint32_t x) {
    uint32_t res;
    const uint32_t zero = 0u;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // SM80+: a single packed max against an all-zero word.
    asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
#else
    // Older architectures: compare each half against zero ("greater than or unordered"),
    // then AND the comparison result with x so that halves failing the test become zero.
    // NOTE(review): because the comparison is "gtu", NaN halves appear to pass through here,
    // whereas the max-based path above may not propagate them -- confirm if NaN handling matters.
    asm volatile(
        "{\n"
        "\t .reg .f16x2 sela;\n"
        "\t set.gtu.u32.f16x2 sela, %1, %2;\n"
        "\t and.b32 %0, sela, %1;\n"
        "}\n" : "=r"(res) : "r"(x), "r"(zero));
#endif
    return res;
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
// ReLU on a packed pair of bf16 values (bf16x2) held in one 32-bit register.
// Only compiled for SM80 and newer, where max.bf16x2 is used.
template<>
__forceinline__ __device__ uint32_t relu2<cutlass::bfloat16_t>(const uint32_t x) {
    uint32_t res;
    const uint32_t zero = 0u;
    // Packed max against an all-zero word.
    asm volatile("max.bf16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
    return res;
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

// Converts the two fp32 values in x to a packed pair of 16-bit floats of type T, with ReLU
// fused into the conversion instruction (cvt.rn.relu). T must be given explicitly; only the
// cutlass::half_t and cutlass::bfloat16_t specializations are defined.
// Requires SM80+ (the whole family is compiled only for __CUDA_ARCH__ >= 800).
template<typename T>
__forceinline__ __device__ uint32_t convert_relu2(const float2 x);

// fp32 pair -> packed f16x2 with ReLU.
template<>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::half_t>(const float2 x) {
    uint32_t res;
    // Hand the raw fp32 bit patterns to the asm statement through 32-bit registers.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    // x.y is passed as the first source operand and x.x as the second.
    asm volatile("cvt.rn.relu.f16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

// fp32 pair -> packed bf16x2 with ReLU.
template<>
__forceinline__ __device__ uint32_t convert_relu2<cutlass::bfloat16_t>(const float2 x) {
    uint32_t res;
    // Hand the raw fp32 bit patterns to the asm statement through 32-bit registers.
    const uint32_t a = __float_as_uint(x.x);
    const uint32_t b = __float_as_uint(x.y);
    // x.y is passed as the first source operand and x.x as the second.
    asm volatile("cvt.rn.relu.bf16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
    return res;
}

#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary maximum functor; usable as the Operator argument of Allreduce<...>::run.
template<typename T>
struct MaxOp {
    // Returns x when x > y, otherwise y.
    __device__ __forceinline__ T operator()(T const & x, T const & y) const { return x > y ? x : y; }
};

// float specialization of MaxOp that calls max() instead of using a comparison.
template <>
struct MaxOp<float> {
    // This is slightly faster
    __device__ __forceinline__ float operator()(float const &x, float const &y) const { return max(x, y); }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Binary addition functor; usable as the Operator argument of Allreduce<...>::run.
template<typename T>
struct SumOp {
    // Returns x + y.
    __device__ __forceinline__ T operator()(T const & x, T const & y) const { return x + y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// All-reduce across groups of THREADS lanes using XOR shuffles.
// Each step combines a lane's value with the value of the lane whose index differs by
// OFFSET = THREADS / 2, then recurses with half the group size until Allreduce<2> ends the
// recursion. Every participating lane ends up with the combined value.
// The shuffle uses a full mask (all 32 bits set), so all lanes of the warp must call run().
template<int THREADS>
struct Allreduce {
    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4,
                  "THREADS must be 4, 8, 16 or 32");
    // x:  this lane's contribution.
    // op: binary combining functor, called as op(mine, theirs).
    // Returns the reduction over the THREADS-lane group.
    template<typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        constexpr int OFFSET = THREADS / 2;
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
        return Allreduce<OFFSET>::run(x, op);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Base case of the XOR-shuffle all-reduce: combines a lane with its neighbour (lane index ^ 1).
// The shuffle uses a full mask (all 32 bits set), so all lanes of the warp must call run().
template<>
struct Allreduce<2> {
    // x:  this lane's contribution.
    // op: binary combining functor, called as op(mine, theirs).
    // Returns the reduction over the pair of lanes.
    template<typename T, typename Operator>
    static __device__ __forceinline__ T run(T x, Operator &op) {
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
        return x;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// Accumulates acc += A * B over the K mode (mode 2) of the register fragments tCrA / tCrB.
// Unless A_in_regs / B_in_regs is set, the corresponding operand's k-slices are first copied
// from tCsA / tCsB into the register fragment with smem_tiled_copy_A / smem_tiled_copy_B.
// The copy of slice i + 1 is issued before the MMA on slice i.
//
// Template flags:
//   A_in_regs / B_in_regs: when true, tCrA / tCrB is assumed to be populated already and no
//                          copy is issued for that operand.
// Parameters:
//   acc:               accumulator fragment, updated in place.
//   tCrA, tCrB:        register fragments for A and B; modes are (MMA, MMA_M|MMA_N, MMA_K).
//   tCsA, tCsB:        copy sources for A and B (shared memory, judging by the parameter names).
//   tiled_mma:         MMA descriptor passed to cute::gemm.
//   smem_tiled_copy_*: copy descriptors passed to cute::copy.
//   smem_thr_copy_*:   per-thread copy slices, used to retile the register fragments as copy
//                      destinations.
template<bool A_in_regs=false, bool B_in_regs=false, typename Tensor0, typename Tensor1,
         typename Tensor2, typename Tensor3, typename Tensor4,
         typename TiledMma, typename TiledCopyA, typename TiledCopyB,
         typename ThrCopyA, typename ThrCopyB>
__forceinline__ __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsA,
                            Tensor4 const& tCsB, TiledMma tiled_mma,
                            TiledCopyA smem_tiled_copy_A, TiledCopyB smem_tiled_copy_B,
                            ThrCopyA smem_thr_copy_A, ThrCopyB smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
    // Views of the register fragments laid out as destinations of the copies.
    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
    // Load the first k-slice before entering the loop.
    if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
    if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }
    #pragma unroll
    for (int i = 0; i < size<2>(tCrA); ++i) {
        // Issue the copy for the next k-slice before computing on the current one.
        if (i < size<2>(tCrA) - 1) {
            if constexpr (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, i + 1), tCrA_copy_view(_, _, i + 1)); }
            if constexpr (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1)); }
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Variant of gemm() in which operand A is already in registers: only B's k-slices are copied
// from tCsB into tCrB (via smem_tiled_copy_B) before each MMA. The copy of slice i + 1 is
// issued before the MMA on slice i.
//
// Parameters:
//   acc:               accumulator fragment, updated in place.
//   tCrA:              register fragment for A, read as-is.
//   tCrB:              register fragment for B, filled slice by slice.
//   tCsB:              copy source for B (shared memory, judging by the parameter names).
//   tiled_mma:         MMA descriptor passed to cute::gemm.
//   smem_tiled_copy_B: copy descriptor passed to cute::copy.
//   smem_thr_copy_B:   per-thread copy slice, used to retile tCrB as the copy destination.
template<typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3,
         typename TiledMma, typename TiledCopy, typename ThrCopy>
__forceinline__ __device__ void gemm_rs(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsB,
                               TiledMma tiled_mma, TiledCopy smem_tiled_copy_B,
                               ThrCopy smem_thr_copy_B) {
    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
    // View of tCrB laid out as the destination of the copy.
    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
    // Load the first k-slice before entering the loop.
    cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
    #pragma unroll
    for (int i = 0; i < size<2>(tCrA); ++i) {
        // Issue the copy for the next k-slice before computing on the current one.
        if (i < size<2>(tCrA) - 1) {
            cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
        }
        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
// The size-4 MMA mode is split into (2, 2); its second factor is paired with MMA_M to form the
// row mode and its first factor with MMA_N to form the column mode.
// Requires a rank-3 layout whose mode 0 has static size 4.
template<typename Layout>
__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) {
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    auto l = logical_divide(acc_layout, Shape<_2>{});  // ((2, 2), MMA_M, MMA_N)
    return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l)));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8.
// The K extent is read from MMA_traits::Shape_MNK and must be 8 or 16.
// Requires a rank-3 layout whose mode 0 has static size 4.
template<typename MMA_traits, typename Layout>
__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{});
    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
    if constexpr (mma_shape_K == 8) {
        // K == 8: the layout is returned unchanged.
        return acc_layout;
    } else {
        // K == 16: fold pairs of MMA_N entries into the MMA mode.
        auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
        return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
// Pairs of MMA_N entries are folded into the MMA mode.
// Requires a rank-3 layout whose mode 0 has static size 4.
template<typename Layout>
__forceinline__ __device__ auto convert_layout_acc_dropout(Layout acc_layout) {
    using X = Underscore;
    static_assert(decltype(size<0>(acc_layout))::value == 4);
    static_assert(decltype(rank(acc_layout))::value == 3);
    auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
    return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Converts every element of `tensor` to To_type with cutlass::NumericArrayConverter and returns
// a tensor of To_type that reuses tensor.layout().
// The element count must be a compile-time constant (it is taken from decltype(size(tensor))).
// NOTE(review): the returned tensor is a non-owning view of the local `frag`. This appears to
// rely on the function being force-inlined into its caller so that the storage stays live --
// confirm before removing __forceinline__ or changing how the result is consumed.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    constexpr int numel = decltype(size(tensor))::value;
    cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
    // HACK: this requires tensor to be "contiguous"
    // The tensor's storage is reinterpreted as one flat cutlass::Array of numel elements.
    auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
    return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// In-place ReLU over a tensor of 16-bit floating-point elements.
// The tensor is reinterpreted as 32-bit words and each word (two packed elements) is passed
// through relu2<value_t>, so the element count must be even and compile-time constant, and
// relu2 must have a specialization for the element type.
template <typename Engine, typename Layout>
__forceinline__ __device__ void relu_(Tensor<Engine, Layout> &tensor) {
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
    using value_t = typename Engine::value_type;
    // HACK: this requires tensor to be "contiguous"
    Tensor tensor_uint32 = recast<uint32_t>(tensor);
    #pragma unroll
    for (int i = 0; i < size(tensor_uint32); ++i) {
        tensor_uint32(i) = relu2<value_t>(tensor_uint32(i));
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// On SM80 and above, we can fuse fp32 -> fp16/bf16 conversion and relu into 1 instruction
// Converts an fp32 tensor to To_type (cutlass::half_t or cutlass::bfloat16_t) and applies ReLU.
// The element count must be even and compile-time constant. The result reuses tensor.layout().
// Below SM80 the work is done as two passes: flash::convert_type followed by flash::relu_.
// NOTE(review): in the SM80 path the returned tensor is a non-owning view of the local
// `out_uint32`; this appears to rely on force-inlining to keep the storage live -- confirm
// before removing __forceinline__.
template <typename To_type, typename Engine, typename Layout>
__forceinline__ __device__ auto convert_type_relu(Tensor<Engine, Layout> const &tensor) {
    using From_type = typename Engine::value_type;
    static_assert(std::is_same_v<To_type, cutlass::half_t> || std::is_same_v<To_type, cutlass::bfloat16_t>);
    static_assert(std::is_same_v<float, From_type>);
    constexpr int numel = decltype(size(tensor))::value;
    static_assert(numel % 2 == 0);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    // HACK: this requires tensor to be "contiguous"
    // View the input as float2 pairs; each pair becomes one packed 32-bit output word.
    Tensor tensor_float2 = recast<float2>(tensor);
    Tensor out_uint32 = make_tensor<uint32_t>(tensor_float2.layout());
    #pragma unroll
    for (int i = 0; i < size(out_uint32); ++i) {
        out_uint32(i) = convert_relu2<To_type>(tensor_float2(i));
    }
    // Reinterpret the packed words as To_type elements with the caller's layout.
    Tensor out = make_tensor(make_rmem_ptr<To_type>(out_uint32.data()), tensor.layout());
#else
    Tensor out = flash::convert_type<To_type>(tensor);
    flash::relu_(out);
#endif
    return out;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Waits on previously committed cp.async groups by issuing `cp.async.wait_group N`, i.e. it
// blocks until all but the N most recent commit groups are done.
// Unlike cute::cp_async_wait, N == 0 is not turned into cp.async.wait_all (which is equivalent
// to commit_group followed by wait_group 0); wait_group 0 is issued directly, which is
// slightly faster.
// See https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
// Emits nothing when CUTE_ARCH_CP_ASYNC_SM80_ENABLED is not defined.
template <int N>
CUTE_HOST_DEVICE void cp_async_wait() {
#ifdef CUTE_ARCH_CP_ASYNC_SM80_ENABLED
    asm volatile("cp.async.wait_group %0;\n" : : "n"(N));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Resolves the offset of a slice of a paged kv copy from gmem.
// Assumes that the tensor has already been positioned at the correct head.
//
// Parameters:
//   tidx:            thread index used to derive this thread's row and column within the tile.
//   n_block_max:     one past the index of the kBlockN-row block being addressed; the row base
//                    is (n_block_max - 1) * kBlockN.
//   page_block_size: number of rows per page.
//   block_table:     maps a virtual page index to a physical page index.
//   page_stride:     element stride between consecutive physical pages.
//   row_stride:      element stride between consecutive rows within a page.
// Returns the 64-bit element offset
//   block_table[virtual page] * page_stride + row-in-page * row_stride + column.
// No bounds checking is performed on the block_table index.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t resolve_thread_kv_page_slice_offset(const int tidx, const int n_block_max, const int page_block_size,
                            const int* block_table, const int page_stride, const int row_stride) {
    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
    constexpr int kGmemElemsPerLoad = Kernel_traits::kGmemElemsPerLoad;
    constexpr int kBlockN = Kernel_traits::kBlockN;

    // Position of this thread inside the tile.
    const int64_t col_offset = tidx % kGmemThreadsPerRow * kGmemElemsPerLoad;
    const int64_t block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;
    // Widen before multiplying so the row base is computed in 64 bits, like the rest of the offset.
    const int64_t global_row_offset = block_row_offset + static_cast<int64_t>(n_block_max - 1) * kBlockN;
    // Split the global row into (page index, row within page).
    const int64_t page_offset = global_row_offset % page_block_size;
    const int64_t virtual_page_idx = global_row_offset / page_block_size;

    return ((int64_t) block_table[virtual_page_idx]) * ((int64_t) page_stride)
        + page_offset * ((int64_t) row_stride)
        + col_offset;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Layout reshape function. Given a layout with modes ((v1, v2), m, k), returns (v1, v2, k),
// where v2 may be a tuple itself, in the case of swizzled smem-backed thread tiles. This ensures
// that paged and non-paged copies result in equivalently shaped, if not necessarily strided, tensors.
// Mode 1 (m) of the input is dropped; mode 2 is appended to mode 0 for both shape and stride.
template <class Shape, class Stride>
__forceinline__ __device__
auto reshape_thread_tile(Layout<Shape, Stride> l) {
    return make_layout(append(get<0>(l.shape()), get<2>(l.shape())),
                        append(get<0>(l.stride()), get<2>(l.stride())));
}

// Reshapes and flattens the thread tile layout. A separate function is needed for the case where
// one of the modes of l is a layout itself and must be flattened, as opposed to keeping it intact
// for the case of swizzled layouts.
// Mode 0 is flattened and filtered first; mode 2 is then appended to it for both shape and
// stride. Mode 1 of the input is dropped.
template <class Shape, class Stride>
__forceinline__ __device__
auto reshape_flatten_thread_tile(Layout<Shape, Stride> l) {
    auto mode_0 = filter(flatten(get<0>(l)));
    return make_layout(append(mode_0.shape(), get<2>(l.shape())),
                        append(mode_0.stride(), get<2>(l.stride())));
}
////////////////////////////////////////////////////////////////////////////////////////////////////

// Predicated tile copy from S to D, iterating over mode 1 (m) and mode 2 (k) of rank-3 tensors.
//
// Template flags:
//   Is_even_MN:   when true, no row bound check is made; otherwise row m is copied only if
//                 get<0>(identity_MN(0, m, 0)) < max_MN.
//   Is_even_K:    when true, no k predicate is consulted; otherwise slice k is copied only if
//                 predicate_K(k) is true.
//   Clear_OOB_MN: clear D for rows that fail the row bound check.
//   Clear_OOB_K:  clear D for k-slices that fail the k predicate.
//                 Clear_OOB_MN without Clear_OOB_K is rejected at compile time.
// Parameters:
//   tiled_copy:  copy descriptor passed to cute::copy.
//   S, D:        source and destination; sizes of modes 0, 1 and 2 must match.
//   identity_MN: coordinate tensor; component 0 of element (0, m, 0) is the row index of m.
//   predicate_K: per-k validity predicate.
//   max_MN:      exclusive upper bound on valid row indices (only used when !Is_even_MN).
template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
          typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layout0> const &S,
                            Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                            Tensor<Engine3, Layout3> const &predicate_K, const int max_MN=0) {
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    // There's no case where !Clear_OOB_K && Clear_OOB_MN
    static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
            // Row is in bounds: copy each valid k-slice, optionally clearing the invalid ones.
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(tiled_copy, S(_, m, k), D(_, m, k));
                } else if (Clear_OOB_K) {
                    cute::clear(D(_, m, k));
                }
            }
        } else if (Clear_OOB_MN) {
            // Row is out of bounds: clear the whole row of D.
            cute::clear(D(_, m, _));
        }
    }
    // TD [2023-04-13]: Strange that the code below can cause race condition.
    // I think it's because the copies are under an if statement.
    // if (Is_even_K) {
    //     #pragma unroll
    //     for (int m = 0; m < size<1>(S); ++m) {
    //         if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //             copy(tiled_copy, S(_, m, _), D(_, m, _));
    //         } else if (Clear_OOB_MN) {
    //             clear(D(_, m, _));
    //         }
    //     }
    // } else {  // It's slightly faster in this case if iterate over K first
    //     #pragma unroll
    //     for (int k = 0; k < size<2>(S); ++k) {
    //         if (predicate_K(k)) {
    //             #pragma unroll
    //             for (int m = 0; m < size<1>(S); ++m) {
    //                 if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
    //                     copy(tiled_copy, S(_, m, k), D(_, m, k));
    //                 } else if (Clear_OOB_MN) {
    //                     clear(D(_, m, k));
    //                 }
    //             }
    //         } else if (Clear_OOB_K) {  // There's no case where !Clear_OOB_K && Clear_OOB_MN
    //             if (Clear_OOB_MN || Is_even_MN) {
    //                 clear(D(_, _, k));
    //             } else {
    //                 #pragma unroll
    //                 for (int m = 0; m < size<1>(S); ++m) {
    //                     if (!(Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN)) {
    //                         clear(D(_, m, k));
    //                     }
    //                 }
    //             }
    //         }
    //     }
    // }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Copies rows of S into D whose row index lies in the half-open range [min_MN, max_MN).
// Rows outside the range and k-slices rejected by predicate_K are left untouched in D
// (nothing is cleared). With the default bounds (max_MN = 0, min_MN = 0) no row with a
// non-negative index qualifies.
//
// Template flags:
//   Is_even_K: when true, predicate_K is not consulted and every k-slice is copied.
// Parameters:
//   S, D:        source and destination; sizes of modes 0, 1 and 2 must match.
//   identity_MN: coordinate tensor; component 0 of element (0, m, 0) is the row index of m.
//   predicate_K: per-k validity predicate.
//   max_MN:      exclusive upper bound on the row index.
//   min_MN:      inclusive lower bound on the row index.
template <bool Is_even_K=true,
          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
__forceinline__ __device__ void copy_w_min_idx(Tensor<Engine0, Layout0> const &S,
                                      Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
                                      Tensor<Engine3, Layout3> const &predicate_K,
                                      const int max_MN=0, const int min_MN=0) {
    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
    // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, max_MN = %d, min_MN = %d\n", blockIdx.y, max_MN, min_MN); }
    #pragma unroll
    for (int m = 0; m < size<1>(S); ++m) {
        // Row index of m, looked up once and reused for both bound checks.
        const auto row_idx = get<0>(identity_MN(0, m, 0));
        // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
        if (row_idx >= min_MN && row_idx < max_MN) {
            // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("Inner loop, blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
            #pragma unroll
            for (int k = 0; k < size<2>(S); ++k) {
                if (Is_even_K || predicate_K(k)) {
                    cute::copy(S(_, m, k), D(_, m, k));
                }
            }
        }
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Replaces every element of `tensor` in place with cutlass::fast_tanh(element * softcap).
// Note that the result is not rescaled afterwards inside this function.
template <typename Engine, typename Layout>
__forceinline__ __device__ void apply_softcap(Tensor<Engine, Layout> &tensor, const float softcap){
    #pragma unroll
    for (int i = 0; i < size(tensor); ++i) {
        tensor(i) = cutlass::fast_tanh(tensor(i) * softcap);
    }
}

// Elementwise: dst_tensor(i) = (1 - src_tensor(i)^2) * softcap, for i over size(src_tensor).
// NOTE(review): presumably src_tensor holds tanh outputs, making this the tanh derivative
// scaled by softcap -- confirm at the call sites.
template <typename Engine0, typename Layout0, typename Engine1, typename Layout1>
__forceinline__ __device__ void calculate_dtanh(Tensor<Engine0, Layout0> &src_tensor, Tensor<Engine1, Layout1> &dst_tensor, const float softcap){
    #pragma unroll
    for (int idx = 0; idx < size(src_tensor); ++idx) {
        const auto src_val = src_tensor(idx);
        const auto one_minus_sq = 1.f - (src_val * src_val);
        dst_tensor(idx) = one_minus_sq * softcap;
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace flash