/******************************************************************************
 * Copyright (c) 2022, Tri Dao.
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>

#include "fmha.h"

Tri Dao's avatar
Tri Dao committed
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// Assert that tensor `x` has exactly the shape given by the variadic arguments.
// Expands to a TORCH_CHECK comparing x.sizes() against an IntArrayRef built from
// the arguments; the failure message contains the stringified tensor expression
// and the stringified shape expressions (not their runtime values).
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")


void set_params_fprop(FMHA_fprop_params &params,
                      // sizes
                      const size_t b,
                      const size_t seqlen_q,
                      const size_t seqlen_k,
                      const size_t h,
                      const size_t d,
                      // device pointers
                      const at::Tensor q,
                      const at::Tensor k,
                      const at::Tensor v,
48
                      at::Tensor out,
Tri Dao's avatar
Tri Dao committed
49
50
51
52
53
54
55
56
                      void *cu_seqlens_q_d,
                      void *cu_seqlens_k_d,
                      void *o_tmp_d,
                      void *s_d,
                      void *softmax_lse_d,
                      float p_dropout,
                      float softmax_scale,
                      bool is_causal) {
Tri Dao's avatar
Tri Dao committed
57
58

    Data_type acc_type = DATA_TYPE_FP32;
Tri Dao's avatar
Tri Dao committed
59
    Data_type data_type = !(q.dtype() == torch::kBFloat16) ? DATA_TYPE_FP16 : DATA_TYPE_BF16;
Tri Dao's avatar
Tri Dao committed
60
61
62
63

    // Reset the parameters
    memset(&params, 0, sizeof(params));

Tri Dao's avatar
Tri Dao committed
64
65
    params.is_bf16 = q.dtype() == torch::kBFloat16;

Tri Dao's avatar
Tri Dao committed
66
    // Set the pointers and strides.
Tri Dao's avatar
Tri Dao committed
67
68
69
70
71
72
73
74
75
    params.q_ptr = q.data_ptr();
    params.k_ptr = k.data_ptr();
    params.v_ptr = v.data_ptr();
    params.q_row_stride_in_elts = q.stride(0);
    params.k_row_stride_in_elts = k.stride(0);
    params.v_row_stride_in_elts = v.stride(0);
    params.q_head_stride_in_elts = q.stride(1);
    params.k_head_stride_in_elts = k.stride(1);
    params.v_head_stride_in_elts = v.stride(1);
76
77
78
    params.o_ptr = out.data_ptr();
    params.o_row_stride_in_elts = out.stride(0);
    params.o_head_stride_in_elts = out.stride(1);
Tri Dao's avatar
Tri Dao committed
79
    params.o_tmp_ptr = o_tmp_d;
80
81
    params.o_tmp_row_stride_in_elts = h * d;
    params.o_tmp_head_stride_in_elts = d;
Tri Dao's avatar
Tri Dao committed
82

Tri Dao's avatar
Tri Dao committed
83
84
    params.cu_seqlens_q = static_cast<int *>(cu_seqlens_q_d);
    params.cu_seqlens_k = static_cast<int *>(cu_seqlens_k_d);
Tri Dao's avatar
Tri Dao committed
85
86
87

    // S = softmax(P)
    params.s_ptr = s_d;
Tri Dao's avatar
Tri Dao committed
88
    params.s_stride_in_bytes = get_size_in_bytes(b * h * seqlen_k, data_type);
Tri Dao's avatar
Tri Dao committed
89
90
91
92
93
94
95

    // Softmax sum
    params.softmax_lse_ptr = softmax_lse_d;

    // Set the dimensions.
    params.b = b;
    params.h = h;
Tri Dao's avatar
Tri Dao committed
96
97
    params.seqlen_q = seqlen_q;
    params.seqlen_k = seqlen_k;
Tri Dao's avatar
Tri Dao committed
98
99
100
101
102
103
104
105
106
107
108
109
    params.d = d;

    // Set the different scale values.
    // const float scale_bmm1 = 1.f / sqrtf(d);
    const float scale_bmm1 = softmax_scale;

    params.scale_bmm1f = scale_bmm1;
    set_alpha(params.scale_bmm1, scale_bmm1, data_type);

    // Set this to probability of keeping an element to simplify things.
    params.p_dropout = 1.f - p_dropout;
    // Convert p from float to int so we don't have to convert the random uint to float to compare.
110
    // [Minor] We want to round down since when we do the comparison we use <= instead of <
Tri Dao's avatar
Tri Dao committed
111
112
113
    params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0));
    params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0));
    params.rp_dropout = 1.f / params.p_dropout;
114
    params.scale_bmm1_rp_dropout = params.rp_dropout * params.scale_bmm1f;
Tri Dao's avatar
Tri Dao committed
115
116
117
118
119
120
    TORCH_CHECK(p_dropout < 1.f);
    set_alpha(params.scale_dropout, params.rp_dropout, data_type);

    params.is_causal = is_causal;
}

Tri Dao's avatar
Tri Dao committed
121
122
123
124
125
126
127
128
129
130
131
/**
 * Populate an FMHA_dgrad_params struct for a backward launch.
 *
 * The fields shared with the forward pass are filled by set_params_fprop; this
 * function then records the gradient tensors and the backward-only pointers.
 *
 * @param params           Struct to fill.
 * @param b, seqlen_q, seqlen_k, h, d
 *                         Sizes, forwarded unchanged to set_params_fprop.
 * @param q, k, v, out     Forward tensors, forwarded unchanged to set_params_fprop.
 * @param dq, dk, dv       Gradient tensors; their data pointers and dim-0 / dim-1
 *                         strides are recorded as row / head strides (in elements).
 * @param cu_seqlens_q_d   Forwarded to set_params_fprop.
 * @param cu_seqlens_k_d   Forwarded to set_params_fprop.
 * @param dq_tmp_d         Passed in the o_tmp_d slot of set_params_fprop.
 * @param do_packed_d      Stored as params.do_ptr.
 * @param softmax_lse_d    Forwarded to set_params_fprop.
 * @param dsoftmax_sum_d   Stored as params.dsoftmax_sum.
 * @param p_dropout, softmax_scale, is_causal
 *                         Forwarded unchanged to set_params_fprop.
 */
void set_params_dgrad(FMHA_dgrad_params &params,
                      // sizes
                      const size_t b,
                      const size_t seqlen_q,
                      const size_t seqlen_k,
                      const size_t h,
                      const size_t d,
                      // device pointers
                      const at::Tensor q,
                      const at::Tensor k,
                      const at::Tensor v,
                      const at::Tensor out,
                      at::Tensor dq,
                      at::Tensor dk,
                      at::Tensor dv,
                      void *cu_seqlens_q_d,
                      void *cu_seqlens_k_d,
                      void *dq_tmp_d,
                      void *do_packed_d,
                      void *softmax_lse_d,
                      void *dsoftmax_sum_d,
                      float p_dropout,
                      float softmax_scale,
                      bool is_causal) {

    // Everything the backward pass shares with the forward pass is set here.
    // The o_tmp_ptr field is reused to carry dq_tmp, and no S buffer is supplied.
    set_params_fprop(params,
                     b, seqlen_q, seqlen_k, h, d,
                     q, k, v, out,
                     cu_seqlens_q_d,
                     cu_seqlens_k_d,
                     /*o_tmp_d=*/dq_tmp_d,
                     /*s_d=*/nullptr,
                     softmax_lse_d,
                     p_dropout,
                     softmax_scale,
                     is_causal);

    // Gradient w.r.t. Q.
    params.dq_ptr = dq.data_ptr();
    params.dq_row_stride_in_elts = dq.stride(0);
    params.dq_head_stride_in_elts = dq.stride(1);

    // Gradient w.r.t. K.
    params.dk_ptr = dk.data_ptr();
    params.dk_row_stride_in_elts = dk.stride(0);
    params.dk_head_stride_in_elts = dk.stride(1);

    // Gradient w.r.t. V.
    params.dv_ptr = dv.data_ptr();
    params.dv_row_stride_in_elts = dv.stride(0);
    params.dv_head_stride_in_elts = dv.stride(1);

    // Incoming gradient of the output.
    params.do_ptr = do_packed_d;

    // Softmax sum
    params.dsoftmax_sum = dsoftmax_sum_d;
}

174
std::vector<at::Tensor>
Tri Dao's avatar
Tri Dao committed
175
176
177
mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
        const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
        const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
178
        at::Tensor &out,             // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
Tri Dao's avatar
Tri Dao committed
179
180
181
182
        const at::Tensor &cu_seqlens_q,  // b+1
        const at::Tensor &cu_seqlens_k,  // b+1
        const int max_seqlen_q_,
        const int max_seqlen_k_,
Tri Dao's avatar
Tri Dao committed
183
184
185
186
187
188
189
190
        const float p_dropout,
        const float softmax_scale,
        const bool zero_tensors,
        const bool is_causal,
        const bool return_softmax,
        c10::optional<at::Generator> gen_) {

    auto dprops = at::cuda::getCurrentDeviceProperties();
Tri Dao's avatar
Tri Dao committed
191
    bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
Tri Dao's avatar
Tri Dao committed
192
    bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
Tri Dao's avatar
Tri Dao committed
193
194
    bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
    TORCH_CHECK(is_sm8x || is_sm75);
Tri Dao's avatar
Tri Dao committed
195
196
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    bool is_dropout = p_dropout > 0.0;
Tri Dao's avatar
Tri Dao committed
197
198
    Launch_params<FMHA_fprop_params> launch_params(dprops, stream, is_dropout, return_softmax);

Tri Dao's avatar
Tri Dao committed
199
200
201
202
    auto q_dtype = q.dtype();
    TORCH_CHECK(q_dtype == torch::kFloat16 || (is_sm8x && q_dtype == torch::kBFloat16));
    TORCH_CHECK(k.dtype() == q_dtype);
    TORCH_CHECK(v.dtype() == q_dtype);
203
    TORCH_CHECK(out.dtype() == q_dtype);
Tri Dao's avatar
Tri Dao committed
204
205
206
207
208
209
    TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32);
    TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32);

    TORCH_CHECK(q.is_cuda());
    TORCH_CHECK(k.is_cuda());
    TORCH_CHECK(v.is_cuda());
210
    TORCH_CHECK(out.is_cuda());
Tri Dao's avatar
Tri Dao committed
211
212
213
214
215
216
    TORCH_CHECK(cu_seqlens_q.is_cuda());
    TORCH_CHECK(cu_seqlens_k.is_cuda());

    TORCH_CHECK(q.stride(-1) == 1);
    TORCH_CHECK(k.stride(-1) == 1);
    TORCH_CHECK(v.stride(-1) == 1);
217
    TORCH_CHECK(out.stride(-1) == 1);
Tri Dao's avatar
Tri Dao committed
218
219
220
221
222
223
224
    TORCH_CHECK(cu_seqlens_k.is_contiguous());
    TORCH_CHECK(cu_seqlens_k.is_contiguous());

    const auto sizes = q.sizes();

    const int batch_size = cu_seqlens_q.numel() - 1;
    const int total_q = sizes[TOTAL_DIM];
Tri Dao's avatar
Tri Dao committed
225
226
    const int num_heads = sizes[H_DIM];
    const int head_size = sizes[D_DIM];
Tri Dao's avatar
Tri Dao committed
227
    const int total_k = k.size(TOTAL_DIM);
Tri Dao's avatar
Tri Dao committed
228
229
230
    TORCH_CHECK(batch_size > 0);
    TORCH_CHECK(head_size == 16 || head_size == 32 || head_size == 64 || head_size == 128);

Tri Dao's avatar
Tri Dao committed
231
232
233
    CHECK_SHAPE(q, total_q, num_heads, head_size);
    CHECK_SHAPE(k, total_k, num_heads, head_size);
    CHECK_SHAPE(v, total_k, num_heads, head_size);
234
    CHECK_SHAPE(out, total_q, num_heads, head_size);
Tri Dao's avatar
Tri Dao committed
235
236
237
238
239
240
241
242
243
244
    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);

    int blocksize_c = ((head_size == 128 && (is_dropout || !is_sm80)) || (is_sm75 && head_size == 64 && is_dropout)) ? 128 : 256;
    // Need to round max_seqlen_k to multiples of blocksize_c
    int max_seqlen_k = ((max_seqlen_k_ + blocksize_c - 1) / blocksize_c) * blocksize_c;
    if( max_seqlen_k_ <= 128 ) {
        max_seqlen_k = 128;
    } else if( max_seqlen_k_ <= 256 ) {
        max_seqlen_k = 256;
Tri Dao's avatar
Tri Dao committed
245
    }
Tri Dao's avatar
Tri Dao committed
246
247
    int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
    bool loop = max_seqlen_k > blocksize_c;
Tri Dao's avatar
Tri Dao committed
248

Tri Dao's avatar
Tri Dao committed
249
    auto opts = q.options();
Tri Dao's avatar
Tri Dao committed
250

251
    // auto o = torch::empty({ total_q, num_heads, head_size }, opts);
Tri Dao's avatar
Tri Dao committed
252
253

    at::Tensor o_tmp;
Tri Dao's avatar
Tri Dao committed
254
    if (loop) { o_tmp = torch::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat)); }
Tri Dao's avatar
Tri Dao committed
255

Tri Dao's avatar
Tri Dao committed
256
257
    auto softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
    // auto softmax_lse = torch::full({batch_size, num_heads, max_seqlen_k}, -std::numeric_limits<float>::infinity(), opts.dtype(at::kFloat));
Tri Dao's avatar
Tri Dao committed
258
259

    at::Tensor s;
Tri Dao's avatar
Tri Dao committed
260
    if (return_softmax) { s = torch::empty({ batch_size, num_heads, max_seqlen_q, max_seqlen_k }, opts); }
Tri Dao's avatar
Tri Dao committed
261
262

    if( zero_tensors ) {
263
        out.zero_();
Tri Dao's avatar
Tri Dao committed
264
265
266
267
268
269
270
        softmax_lse.fill_(-std::numeric_limits<float>::infinity());
        if (return_softmax) {s.zero_();}
    }

    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
        gen_, at::cuda::detail::getDefaultCUDAGenerator());

Tri Dao's avatar
Tri Dao committed
271
272
273
274
275
276
    set_params_fprop(launch_params.params,
                     batch_size,
                     max_seqlen_q,
                     max_seqlen_k,
                     num_heads,
                     head_size,
277
                     q, k, v, out,
Tri Dao's avatar
Tri Dao committed
278
279
280
281
282
283
284
285
                     cu_seqlens_q.data_ptr(),
                     cu_seqlens_k.data_ptr(),
                     loop ? o_tmp.data_ptr() : nullptr,
                     return_softmax ? s.data_ptr() : nullptr,
                     softmax_lse.data_ptr(),
                     p_dropout,
                     softmax_scale,
                     is_causal);
Tri Dao's avatar
Tri Dao committed
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

    run_fmha_fp16_sm80(launch_params, /*configure=*/ true);
    // number of times random will be generated per thread, to offset philox counter in thc random
    // state
    int64_t counter_offset = launch_params.elts_per_thread;
    at::PhiloxCudaState rng_engine_inputs;

    if( is_dropout ) {
        // See Note [Acquire lock when using random generators]
        std::lock_guard<std::mutex> lock(gen->mutex_);
        launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
    }

    run_fmha_fp16_sm80(launch_params, /*configure=*/false);

301
    std::vector<at::Tensor> result = {softmax_lse};
Tri Dao's avatar
Tri Dao committed
302
303
304
305
306
307
    if (return_softmax) {result.push_back(s);}
    return result;
}


std::vector<at::Tensor>
Tri Dao's avatar
Tri Dao committed
308
309
310
311
312
mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
        const at::Tensor &q,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
        const at::Tensor &k,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
        const at::Tensor &v,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
        const at::Tensor &out,   // total_q x num_heads x head_size
Tri Dao's avatar
Tri Dao committed
313
        const at::Tensor &softmax_lse_,     // b x h x s softmax logsumexp
Tri Dao's avatar
Tri Dao committed
314
315
316
317
318
319
320
        at::Tensor &dq,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
        at::Tensor &dk,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
        at::Tensor &dv,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
        const at::Tensor &cu_seqlens_q,  // b+1
        const at::Tensor &cu_seqlens_k,  // b+1
        const int max_seqlen_q_,
        const int max_seqlen_k_,          // max sequence length to choose the kernel
Tri Dao's avatar
Tri Dao committed
321
322
323
324
325
326
327
        const float p_dropout,         // probability to drop
        const float softmax_scale,
        const bool zero_tensors,
        const bool is_causal,
        c10::optional<at::Generator> gen_
) {
    auto dprops = at::cuda::getCurrentDeviceProperties();
Tri Dao's avatar
Tri Dao committed
328
    bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
Tri Dao's avatar
Tri Dao committed
329
    bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
Tri Dao's avatar
Tri Dao committed
330
331
    bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
    TORCH_CHECK(is_sm8x || is_sm75);
Tri Dao's avatar
Tri Dao committed
332
333
334
335
336
    auto launch = &run_fmha_dgrad_fp16_sm80;

    bool is_dropout = p_dropout > 0.0;
    auto stream = at::cuda::getCurrentCUDAStream().stream();

Tri Dao's avatar
Tri Dao committed
337
338
339
340
341
342
343
344
345
    auto q_dtype = q.dtype();
    TORCH_CHECK(q_dtype == torch::kFloat16 || (is_sm8x && q_dtype == torch::kBFloat16));
    TORCH_CHECK(k.dtype() == q_dtype);
    TORCH_CHECK(v.dtype() == q_dtype);
    TORCH_CHECK(out.dtype() == q_dtype);
    TORCH_CHECK(dout.dtype() == q_dtype);
    TORCH_CHECK(dq.dtype() == q_dtype);
    TORCH_CHECK(dk.dtype() == q_dtype);
    TORCH_CHECK(dv.dtype() == q_dtype);
Tri Dao's avatar
Tri Dao committed
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
    TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32);
    TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32);

    TORCH_CHECK(q.is_cuda());
    TORCH_CHECK(k.is_cuda());
    TORCH_CHECK(v.is_cuda());
    TORCH_CHECK(out.is_cuda());
    TORCH_CHECK(dout.is_cuda());
    TORCH_CHECK(softmax_lse_.is_cuda());
    TORCH_CHECK(cu_seqlens_q.is_cuda());
    TORCH_CHECK(cu_seqlens_k.is_cuda());

    TORCH_CHECK(q.stride(-1) == 1);
    TORCH_CHECK(k.stride(-1) == 1);
    TORCH_CHECK(v.stride(-1) == 1);
    TORCH_CHECK(out.is_contiguous());
    TORCH_CHECK(dout.is_contiguous());
    TORCH_CHECK(dq.stride(-1) == 1);
    TORCH_CHECK(dk.stride(-1) == 1);
    TORCH_CHECK(dv.stride(-1) == 1);
    TORCH_CHECK(cu_seqlens_q.is_contiguous());
    TORCH_CHECK(cu_seqlens_k.is_contiguous());

    const auto sizes = q.sizes();

    const int batch_size = cu_seqlens_q.numel() - 1;
    const int total_q = sizes[TOTAL_DIM];
Tri Dao's avatar
Tri Dao committed
373
374
    const int num_heads = sizes[H_DIM];
    const int head_size = sizes[D_DIM];
Tri Dao's avatar
Tri Dao committed
375
    const int total_k = k.size(TOTAL_DIM);
Tri Dao's avatar
Tri Dao committed
376
377
    TORCH_CHECK(batch_size > 0);
    TORCH_CHECK(head_size == 16 || head_size == 32 || head_size == 64 || head_size == 128);
Tri Dao's avatar
Tri Dao committed
378
379
380
    if (head_size == 128) {  // TODO: eventually we should support SM86 and SM70 with d=128 as well
        TORCH_CHECK(is_sm80);
    }
Tri Dao's avatar
Tri Dao committed
381

Tri Dao's avatar
Tri Dao committed
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
    CHECK_SHAPE(q, total_q, num_heads, head_size);
    CHECK_SHAPE(k, total_k, num_heads, head_size);
    CHECK_SHAPE(v, total_k, num_heads, head_size);
    CHECK_SHAPE(out, total_q, num_heads, head_size);
    CHECK_SHAPE(dout, total_q, num_heads, head_size);
    CHECK_SHAPE(dq, total_q, num_heads, head_size);
    CHECK_SHAPE(dk, total_k, num_heads, head_size);
    CHECK_SHAPE(dv, total_k, num_heads, head_size);
    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);

    int blocksize_c = (head_size == 128 || (is_sm75 && head_size == 64)) ? 128 : 256;
    int max_seqlen_k = ((max_seqlen_k_ + blocksize_c - 1) / blocksize_c) * blocksize_c;
    if( max_seqlen_k_ <= 128 ) {
        max_seqlen_k = 128;
    } else if( max_seqlen_k_ <= 256 ) {
        max_seqlen_k = 256;
Tri Dao's avatar
Tri Dao committed
399
    }
Tri Dao's avatar
Tri Dao committed
400
401
    int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
    bool loop = max_seqlen_k > blocksize_c;
Tri Dao's avatar
Tri Dao committed
402

Tri Dao's avatar
Tri Dao committed
403
404
    // It's possible the softmax_lse_ from the fwd has a different length since blocksize_c could be different.
    auto softmax_lse = softmax_lse_.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(torch::indexing::None, max_seqlen_q)}).contiguous();
Tri Dao's avatar
Tri Dao committed
405

Tri Dao's avatar
Tri Dao committed
406
407
    auto opts = q.options();
    auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
Tri Dao's avatar
Tri Dao committed
408
    at::Tensor dq_tmp;
Tri Dao's avatar
Tri Dao committed
409
    if (loop) { dq_tmp = torch::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat)); }
Tri Dao's avatar
Tri Dao committed
410
411

    if( zero_tensors ) {
Tri Dao's avatar
Tri Dao committed
412
413
414
        dq.zero_();
        dk.zero_();
        dv.zero_();
Tri Dao's avatar
Tri Dao committed
415
416
417
        softmax_d.zero_();
    }

Tri Dao's avatar
Tri Dao committed
418
419
420
421
422
423
424
425
    FMHA_dgrad_params params;

    set_params_dgrad(params,
                     batch_size,
                     max_seqlen_q,
                     max_seqlen_k,
                     num_heads,
                     head_size,
426
                     q, k, v, out,
Tri Dao's avatar
Tri Dao committed
427
428
429
430
431
432
433
434
435
436
                     dq, dk, dv,
                     cu_seqlens_q.data_ptr(),
                     cu_seqlens_k.data_ptr(),
                     loop ? dq_tmp.data_ptr() : nullptr,
                     dout.data_ptr(),
                     softmax_lse.data_ptr(),
                     softmax_d.data_ptr(),
                     p_dropout,
                     softmax_scale,
                     is_causal);
Tri Dao's avatar
Tri Dao committed
437
438
439
440
441

    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
        gen_, at::cuda::detail::getDefaultCUDAGenerator());

    // We're gonna reset the rng state in Python after this kernel, so the counter offset
Tri Dao's avatar
Tri Dao committed
442
    // here doesn't matter at all. We just choose an arbitrary number.
Tri Dao's avatar
Tri Dao committed
443
444
445
446
447
448
449
450
451
    int64_t counter_offset = 4;

    if( is_dropout ) {
        // See Note [Acquire lock when using random generators]
        std::lock_guard<std::mutex> lock(gen->mutex_);
        params.philox_args = gen->philox_cuda_state(counter_offset);
    }

    launch(params, stream);
Tri Dao's avatar
Tri Dao committed
452
    return { dq, dk, dv, softmax_d };
Tri Dao's avatar
Tri Dao committed
453
454
455
}

std::vector<at::Tensor>
Tri Dao's avatar
Tri Dao committed
456
457
458
459
460
mha_fwd_block(const at::Tensor &q,         // total_q x num_heads x head_size, total := \sum_{i=0}^{b} s_i
              const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              const at::Tensor &cu_seqlens_q,  // b+1
              const at::Tensor &cu_seqlens_k,  // b+1
Tri Dao's avatar
Tri Dao committed
461
              const at::Tensor &blockmask,   // (seqlen / 256, seqlen / 16)
Tri Dao's avatar
Tri Dao committed
462
463
              const int max_seqlen_q_,
              const int max_seqlen_k_,
Tri Dao's avatar
Tri Dao committed
464
465
466
467
468
469
470
471
472
473
              const float p_dropout,
              const float softmax_scale,
              const bool is_causal,
              const bool return_softmax,
              c10::optional<at::Generator> gen_) {

    auto dprops = at::cuda::getCurrentDeviceProperties();
    TORCH_CHECK(dprops->major == 8 && dprops->minor >= 0);
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    bool is_dropout = p_dropout > 0.0;
Tri Dao's avatar
Tri Dao committed
474
    Launch_params<FMHA_fprop_params> launch_params(dprops, stream, is_dropout, return_softmax);
Tri Dao's avatar
Tri Dao committed
475

Tri Dao's avatar
Tri Dao committed
476
477
478
479
480
481
    TORCH_CHECK(q.dtype() == torch::kFloat16);
    TORCH_CHECK(k.dtype() == torch::kFloat16);
    TORCH_CHECK(v.dtype() == torch::kFloat16);
    TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32);
    TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32);
    TORCH_CHECK(blockmask.dtype() == torch::kInt32);
Tri Dao's avatar
Tri Dao committed
482

Tri Dao's avatar
Tri Dao committed
483
484
485
486
487
    TORCH_CHECK(q.is_cuda());
    TORCH_CHECK(k.is_cuda());
    TORCH_CHECK(v.is_cuda());
    TORCH_CHECK(cu_seqlens_q.is_cuda());
    TORCH_CHECK(cu_seqlens_k.is_cuda());
Tri Dao's avatar
Tri Dao committed
488
489
    TORCH_CHECK(blockmask.is_cuda())

Tri Dao's avatar
Tri Dao committed
490
491
492
493
494
    TORCH_CHECK(q.stride(-1) == 1);
    TORCH_CHECK(k.stride(-1) == 1);
    TORCH_CHECK(v.stride(-1) == 1);
    TORCH_CHECK(cu_seqlens_k.is_contiguous());
    TORCH_CHECK(cu_seqlens_k.is_contiguous());
Tri Dao's avatar
Tri Dao committed
495
496
    TORCH_CHECK(blockmask.is_contiguous())

Tri Dao's avatar
Tri Dao committed
497
    const auto sizes = q.sizes();
Tri Dao's avatar
Tri Dao committed
498

Tri Dao's avatar
Tri Dao committed
499
500
    const int batch_size = cu_seqlens_q.numel() - 1;
    const int total_q = sizes[TOTAL_DIM];
Tri Dao's avatar
Tri Dao committed
501
502
    const int num_heads = sizes[H_DIM];
    const int head_size = sizes[D_DIM];
Tri Dao's avatar
Tri Dao committed
503
    const int total_k = k.size(TOTAL_DIM);
Tri Dao's avatar
Tri Dao committed
504
    TORCH_CHECK(batch_size > 0);
Tri Dao's avatar
Tri Dao committed
505
    TORCH_CHECK(head_size == 16 || head_size == 32 || head_size == 64 || head_size == 128);
Tri Dao's avatar
Tri Dao committed
506

Tri Dao's avatar
Tri Dao committed
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
    CHECK_SHAPE(q, total_q, num_heads, head_size);
    CHECK_SHAPE(k, total_k, num_heads, head_size);
    CHECK_SHAPE(v, total_k, num_heads, head_size);
    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);

    int max_seqlen_k = ((max_seqlen_k_ + 256 - 1) / 256) * 256;
    if( max_seqlen_k <= 256 ) {
        max_seqlen_k = 256;
    }
    int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
    bool loop = max_seqlen_k > 256;
    CHECK_SHAPE(blockmask, max_seqlen_k / 256, max_seqlen_q / 16);

    auto opts = q.options();

    auto o = torch::zeros({ total_q, num_heads, head_size }, opts);
Tri Dao's avatar
Tri Dao committed
524
525
526
527

    at::Tensor o_tmp;
    if (loop) {
        // o_tmp = torch::zeros({total, num_heads, head_size}, opts.dtype(at::kFloat));
Tri Dao's avatar
Tri Dao committed
528
        o_tmp = torch::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat));
Tri Dao's avatar
Tri Dao committed
529
530
    }

Tri Dao's avatar
Tri Dao committed
531
532
    // auto softmax_lse = torch::full({batch_size, num_heads, max_seqlen_k}, -std::numeric_limits<float>::infinity(), opts.dtype(at::kFloat));
    auto softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
Tri Dao's avatar
Tri Dao committed
533
534
535

    at::Tensor s;
    if (return_softmax) {
Tri Dao's avatar
Tri Dao committed
536
        s = torch::zeros({ batch_size, num_heads, max_seqlen_q, max_seqlen_k }, opts);
Tri Dao's avatar
Tri Dao committed
537
538
539
540
541
    }

    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
        gen_, at::cuda::detail::getDefaultCUDAGenerator());

Tri Dao's avatar
Tri Dao committed
542
543
544
545
546
547
    set_params_fprop(launch_params.params,
                     batch_size,
                     max_seqlen_q,
                     max_seqlen_k,
                     num_heads,
                     head_size,
548
                     q, k, v, o,
Tri Dao's avatar
Tri Dao committed
549
550
551
552
553
554
555
556
                     cu_seqlens_q.data_ptr(),
                     cu_seqlens_k.data_ptr(),
                     loop ? o_tmp.data_ptr() : nullptr,
                     return_softmax ? s.data_ptr() : nullptr,
                     softmax_lse.data_ptr(),
                     p_dropout,
                     softmax_scale,
                     is_causal);
Tri Dao's avatar
Tri Dao committed
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
    launch_params.params.blockmask = static_cast<int *>(blockmask.data_ptr());

    run_fmha_block_fp16_sm80(launch_params, /*configure=*/ true);
    // number of times random will be generated per thread, to offset philox counter in thc random
    // state
    int64_t counter_offset = launch_params.elts_per_thread;
    at::PhiloxCudaState rng_engine_inputs;

    if( is_dropout ) {
        // See Note [Acquire lock when using random generators]
        std::lock_guard<std::mutex> lock(gen->mutex_);
        launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
    }

    run_fmha_block_fp16_sm80(launch_params, /*configure=*/false);

Tri Dao's avatar
Tri Dao committed
573
    std::vector<at::Tensor> result = {o, softmax_lse};
Tri Dao's avatar
Tri Dao committed
574
575
576
577
578
579
    if (return_softmax) {result.push_back(s);}
    return result;
}

// Backward pass of block-sparse fused multi-head attention for fp16 inputs.
//
// Validates device capability, dtypes, devices, strides and shapes, rounds the
// maximum sequence lengths up to the block-mask granularity (256 along K,
// 16 along Q), allocates scratch buffers, and launches
// run_fmha_block_dgrad_fp16_sm80 on the current CUDA stream.
//
// dq, dk and dv are caller-allocated; they are handed to set_params_dgrad and
// returned as-is (presumably filled in by the kernel -- confirm against the
// kernel implementation, which is not part of this function).
//
// Returns {dq, dk, dv, softmax_d}, where softmax_d is a float32 tensor of shape
// batch_size x num_heads x max_seqlen_q (rounded up) allocated here.
// Raises a TORCH_CHECK / CHECK_SHAPE error if any precondition is violated.
std::vector<at::Tensor>
mha_bwd_block(const at::Tensor &dout,  // total_q x num_heads x head_size
              const at::Tensor &q,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
              const at::Tensor &k,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              const at::Tensor &v,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              const at::Tensor &out,   // total_q x num_heads x head_size
              const at::Tensor &softmax_lse_,     // b x h x s softmax logsumexp
              at::Tensor &dq,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
              at::Tensor &dk,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              at::Tensor &dv,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
              const at::Tensor &cu_seqlens_q,  // b+1
              const at::Tensor &cu_seqlens_k,  // b+1
              const at::Tensor &blockmask,   // (max_seqlen_k / 256) x (max_seqlen_q / 16), using the rounded-up lengths
              const int max_seqlen_q_,
              const int max_seqlen_k_,          // max sequence length to choose the kernel
              const float p_dropout,         // probability to drop
              const float softmax_scale,
              const bool is_causal,
              c10::optional<at::Generator> gen_
) {
    auto dprops = at::cuda::getCurrentDeviceProperties();
    const bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
    const bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
    TORCH_CHECK(is_sm8x, "mha_bwd_block requires a GPU with compute capability 8.x");
    auto launch = &run_fmha_block_dgrad_fp16_sm80;

    const bool is_dropout = p_dropout > 0.0;
    auto stream = at::cuda::getCurrentCUDAStream().stream();

    // Element types: every activation/gradient is fp16, index tensors are int32.
    TORCH_CHECK(q.dtype() == torch::kFloat16, "q must have dtype float16");
    TORCH_CHECK(k.dtype() == torch::kFloat16, "k must have dtype float16");
    TORCH_CHECK(v.dtype() == torch::kFloat16, "v must have dtype float16");
    TORCH_CHECK(out.dtype() == torch::kFloat16, "out must have dtype float16");
    TORCH_CHECK(dout.dtype() == torch::kFloat16, "dout must have dtype float16");
    TORCH_CHECK(dq.dtype() == torch::kFloat16, "dq must have dtype float16");
    TORCH_CHECK(dk.dtype() == torch::kFloat16, "dk must have dtype float16");
    TORCH_CHECK(dv.dtype() == torch::kFloat16, "dv must have dtype float16");
    TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
    TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
    TORCH_CHECK(blockmask.dtype() == torch::kInt32, "blockmask must have dtype int32");

    // Devices: every tensor forwarded to the kernel setup must be a CUDA tensor,
    // including the gradient buffers dq/dk/dv.
    TORCH_CHECK(q.is_cuda(), "q must be a CUDA tensor");
    TORCH_CHECK(k.is_cuda(), "k must be a CUDA tensor");
    TORCH_CHECK(v.is_cuda(), "v must be a CUDA tensor");
    TORCH_CHECK(out.is_cuda(), "out must be a CUDA tensor");
    TORCH_CHECK(dout.is_cuda(), "dout must be a CUDA tensor");
    TORCH_CHECK(dq.is_cuda(), "dq must be a CUDA tensor");
    TORCH_CHECK(dk.is_cuda(), "dk must be a CUDA tensor");
    TORCH_CHECK(dv.is_cuda(), "dv must be a CUDA tensor");
    TORCH_CHECK(softmax_lse_.is_cuda(), "softmax_lse must be a CUDA tensor");
    TORCH_CHECK(cu_seqlens_q.is_cuda(), "cu_seqlens_q must be a CUDA tensor");
    TORCH_CHECK(cu_seqlens_k.is_cuda(), "cu_seqlens_k must be a CUDA tensor");
    TORCH_CHECK(blockmask.is_cuda(), "blockmask must be a CUDA tensor");

    // Memory layout: unit stride in the last dimension for q/k/v and their
    // gradients; fully contiguous for everything whose raw pointer is used.
    TORCH_CHECK(q.stride(-1) == 1, "q must have a contiguous last dimension");
    TORCH_CHECK(k.stride(-1) == 1, "k must have a contiguous last dimension");
    TORCH_CHECK(v.stride(-1) == 1, "v must have a contiguous last dimension");
    TORCH_CHECK(out.is_contiguous(), "out must be contiguous");
    TORCH_CHECK(dout.is_contiguous(), "dout must be contiguous");
    TORCH_CHECK(dq.stride(-1) == 1, "dq must have a contiguous last dimension");
    TORCH_CHECK(dk.stride(-1) == 1, "dk must have a contiguous last dimension");
    TORCH_CHECK(dv.stride(-1) == 1, "dv must have a contiguous last dimension");
    TORCH_CHECK(cu_seqlens_q.is_contiguous(), "cu_seqlens_q must be contiguous");
    TORCH_CHECK(cu_seqlens_k.is_contiguous(), "cu_seqlens_k must be contiguous");
    TORCH_CHECK(blockmask.is_contiguous(), "blockmask must be contiguous");

    // Tensor::size(dim) is bounds-checked, so a q/k of too-low rank raises an
    // error here instead of reading past the end of the size array.
    const int batch_size = cu_seqlens_q.numel() - 1;
    const int total_q = q.size(TOTAL_DIM);
    const int num_heads = q.size(H_DIM);
    const int head_size = q.size(D_DIM);
    const int total_k = k.size(TOTAL_DIM);
    TORCH_CHECK(batch_size > 0, "batch size must be positive");
    TORCH_CHECK(head_size == 16 || head_size == 32 || head_size == 64 || head_size == 128,
                "head_size must be one of 16, 32, 64 or 128");
    if (head_size == 128) {  // TODO: eventually we should support SM86 and SM70 with d=128 as well
        TORCH_CHECK(is_sm80, "head_size 128 is only supported on SM80 for the backward pass");
    }

    CHECK_SHAPE(q, total_q, num_heads, head_size);
    CHECK_SHAPE(k, total_k, num_heads, head_size);
    CHECK_SHAPE(v, total_k, num_heads, head_size);
    CHECK_SHAPE(out, total_q, num_heads, head_size);
    CHECK_SHAPE(dout, total_q, num_heads, head_size);
    CHECK_SHAPE(dq, total_q, num_heads, head_size);
    CHECK_SHAPE(dk, total_k, num_heads, head_size);
    CHECK_SHAPE(dv, total_k, num_heads, head_size);
    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);

    // Round K up to a multiple of 256 (at least 256) and Q up to a multiple of 16;
    // the block mask is indexed in units of these tile sizes.
    int max_seqlen_k = ((max_seqlen_k_ + 256 - 1) / 256) * 256;
    if( max_seqlen_k <= 256 ) {
        max_seqlen_k = 256;
    }
    const int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
    // More than one 256-wide K block: a float32 dq scratch buffer is passed to the kernel setup.
    const bool loop = max_seqlen_k > 256;
    CHECK_SHAPE(blockmask, max_seqlen_k / 256, max_seqlen_q / 16);

    // It's possible the softmax_lse_ from the fwd has a different length since blocksize_c could be different.
    auto softmax_lse = softmax_lse_.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(torch::indexing::None, max_seqlen_q)}).contiguous();

    auto opts = q.options();
    auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
    at::Tensor dq_tmp;
    if (loop) {
        // Allocated uninitialized (torch::empty rather than torch::zeros).
        // NOTE(review): presumably the kernel initializes it before accumulating -- confirm.
        dq_tmp = torch::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat));
    }

    FMHA_dgrad_params params;

    set_params_dgrad(params,
                     batch_size,
                     max_seqlen_q,
                     max_seqlen_k,
                     num_heads,
                     head_size,
                     q, k, v, out,
                     dq, dk, dv,
                     cu_seqlens_q.data_ptr(),
                     cu_seqlens_k.data_ptr(),
                     loop ? dq_tmp.data_ptr() : nullptr,
                     dout.data_ptr(),
                     softmax_lse.data_ptr(),
                     softmax_d.data_ptr(),
                     p_dropout,
                     softmax_scale,
                     is_causal);
    params.blockmask = static_cast<int *>(blockmask.data_ptr());

    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
        gen_, at::cuda::detail::getDefaultCUDAGenerator());

    // We're gonna reset the rng state in Python after this kernel, so the counter offset
    // here doesn't matter at all. We just choose an arbitrary number.
    int64_t counter_offset = 4;

    if( is_dropout ) {
        // See Note [Acquire lock when using random generators]
        std::lock_guard<std::mutex> lock(gen->mutex_);
        params.philox_args = gen->philox_cuda_state(counter_offset);
    }

    launch(params, stream);
    return { dq, dk, dv, softmax_d };
}


// Python bindings for the extension module: registers the forward/backward
// entry points and their block-sparse counterparts under the names used from Python.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.doc() = "Fused Multi-head Self-attention";

    // module::def returns the module itself, so the registrations can be chained.
    m.def("fwd",
          &mha_fwd,
          "Forward pass")
     .def("bwd",
          &mha_bwd,
          "Backward pass")
     .def("fwd_block",
          &mha_fwd_block,
          "Forward pass (blocksparse)")
     .def("bwd_block",
          &mha_bwd_block,
          "Backward pass (blocksparse)");
}