# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

"""Attention."""
import os
7
import warnings
8
9
10
import math
from importlib.metadata import version
from contextlib import nullcontext
11
from typing import Any, Callable, Optional, Tuple, Union, Dict
12
13
14
15
16
from pkg_resources import packaging

import torch

import transformer_engine_extensions as tex
17
18
19
20
21
22
23
24
25
26
from transformer_engine.pytorch.cpp_extensions.fused_attn import (
    fused_attn_fwd_qkvpacked,
    fused_attn_bwd_qkvpacked,
    fused_attn_fwd_kvpacked,
    fused_attn_bwd_kvpacked,
    QKVLayout,
    AttnBiasType,
    AttnMaskType,
    FusedAttnBackend,
)
27
28
29
30
31
32
from transformer_engine.pytorch.module import LayerNormLinear, Linear
from transformer_engine.pytorch.utils import (
    divide,
    attention_mask_func,
    split_tensor_along_dim,
    get_device_compute_capability,
33
    get_default_init_method,
34
35
36
37
)
from transformer_engine.pytorch.constants import (
    AttnMaskTypes,
    AttnTypes,
38
    AttnBiasTypes,
39
    dist_group_type,
40
    TE_DType,
41
42
43
44
45
46
47
48
49
)
from transformer_engine.pytorch.softmax import FusedScaleMaskSoftmax
from transformer_engine.pytorch.distributed import (
    get_distributed_world_size,
    checkpoint,
)
from transformer_engine.pytorch.export import is_in_onnx_export_mode

# Installed flash-attn version and the oldest release this module accepts.
_flash_attn_version = packaging.version.Version(version("flash-attn"))
_flash_attn_version_required = packaging.version.Version("1.0.6")
# True when the installed flash-attn is major version 2 or newer.
_flash_attn_2_available = _flash_attn_version >= packaging.version.Version("2")

# The forward entry point is imported under a common alias from a
# version-dependent name. NOTE: `flash_attn_cuda_bwd` is only bound when
# flash-attn >= 2 is installed.
if _flash_attn_2_available:
    from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_forward_func # pylint: disable=no-name-in-module
    from flash_attn_2_cuda import varlen_bwd as flash_attn_cuda_bwd # pylint: disable=no-name-in-module
else:
    from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_forward_func # pylint: disable=no-name-in-module,ungrouped-imports
58
59


60
# Names exported by `from <this module> import *`.
__all__ = ["DotProductAttention", "MultiheadAttention"]
61
62


63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    change sign so the last dimension becomes [-odd, +even]
    """
    x = x.view(x.shape[:-1] + torch.Size((2, x.shape[-1] // 2)))
    x1, x2 = x.unbind(dim=-2)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """
    Apply the rotary positional embedding `freqs` to the first
    `freqs.shape[-1]` channels of `t`; any remaining channels pass through.

    input tensor t is of shape [seq_length, ..., dim]
    rotary positional embedding tensor `freqs` is of shape [seq_length, ..., dim]
    """
    rot_dim = freqs.shape[-1]
    # Ideally `untouched` is empty, i.e. the embedding covers every channel of t.
    rotated_in = t[..., :rot_dim]
    untouched = t[..., rot_dim:]

    # Cosine component uses the input as is; the sine component uses the
    # sign-flipped, half-swapped input produced by _rotate_half.
    cos_part = rotated_in * freqs.cos()
    sin_part = _rotate_half(rotated_in) * freqs.sin()
    return torch.cat((cos_part + sin_part, untouched), dim=-1)


87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
class _SplitLastDim(torch.autograd.Function):
    """"""

    @staticmethod
    def forward(ctx,
                mixed_x_layer: torch.Tensor,
                num_parts: int
    ) -> Tuple[torch.Tensor, ...]:
        return split_tensor_along_dim(mixed_x_layer, -1, num_parts)

    @staticmethod
    def backward(ctx,
                 *grad_outputs):
        assert len(grad_outputs) > 0, "No gradients received for backprop!"

        noop_ok = True
        strides = grad_outputs[0].stride()
        data_ptr = grad_outputs[0].storage().data_ptr()
        shape = grad_outputs[0].shape
        last_dim_size = grad_outputs[0].shape[-1]
        for i, tensor in enumerate(grad_outputs):
            if (tensor.stride() != strides or
                tensor.shape != shape or
                tensor.storage().data_ptr() != data_ptr or
                tensor.storage_offset() != i * last_dim_size):
                noop_ok = False
                break

        if noop_ok:
            ret = torch.Tensor().to(grad_outputs[0].dtype)
            ret = torch.Tensor().to(device=grad_outputs[0].device,
                                    dtype=grad_outputs[0].dtype)
            new_shape = list(shape)
            new_shape[-1] = new_shape[-1] * len(grad_outputs)
            ret.set_(grad_outputs[0].storage(),
                     grad_outputs[0].storage_offset(),
                     new_shape,
                     grad_outputs[0].stride()
            )
            return ret, None

        return torch.cat(grad_outputs, dim = -1), None

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
class _CombineQKV(torch.autograd.Function):
    """"""

    @staticmethod
    def forward(ctx,
                query_layer: torch.Tensor,
                key_layer: torch.Tensor, # pylint: disable=unused-argument
                value_layer: torch.Tensor, # pylint: disable=unused-argument
                dim: int,
    ) -> torch.Tensor:

        mixed_layer = torch.Tensor().to(device=query_layer.device,
                                dtype=query_layer.dtype)
        new_shape = list(query_layer.shape)
        new_shape[dim] = new_shape[dim] * 3
        mixed_layer.set_(query_layer.untyped_storage(),
                 query_layer.storage_offset(),
                 new_shape,
                 query_layer.stride())
        ctx.dim = dim
        return mixed_layer

    @staticmethod
    def backward(ctx,
                 *grad_outputs,
    ) -> Tuple[torch.Tensor, ...]:
        assert len(grad_outputs) > 0, "No gradients received for backprop!"
        tensors = split_tensor_along_dim(grad_outputs[0], ctx.dim, 3)
        return tensors[0], tensors[1], tensors[2], None

class _CombineKV(torch.autograd.Function):
    """"""

    @staticmethod
    def forward(ctx,
                key_layer: torch.Tensor,
                value_layer: torch.Tensor, # pylint: disable=unused-argument
                dim: int,
    ) -> torch.Tensor:

        mixed_layer = torch.Tensor().to(device=key_layer.device,
                                dtype=key_layer.dtype)
        new_shape = list(key_layer.shape)
        new_shape[dim] = new_shape[dim] * 2
        mixed_layer.set_(key_layer.untyped_storage(),
                 key_layer.storage_offset(),
                 new_shape,
                 key_layer.stride())
        ctx.dim = dim
        return mixed_layer

    @staticmethod
    def backward(ctx,
                 *grad_outputs,
    ) -> Tuple[torch.Tensor, ...]:
        assert len(grad_outputs) > 0, "No gradients received for backprop!"
        tensors = split_tensor_along_dim(grad_outputs[0], ctx.dim, 2)
        return tensors[0], tensors[1], None
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

class UnfusedDotProductAttention(torch.nn.Module):
    """Parallel attention w/o QKV and Proj Gemms
    BMM1 -> softmax + dropout -> BMM2

    Parameters
    ----------
    norm_factor : float
        divisor applied to the raw query-key scores.
    attention_dropout : float, default = 0.0
        dropout probability applied to the attention probabilities.
    attention_dropout_ctx : Optional[Callable], default = nullcontext
        zero-argument callable returning the context manager under which
        dropout is executed.
    layer_number : Optional[int], default = None
        extra scaling factor used when the `NVTE_APPLY_QK_LAYER_SCALING`
        environment variable is enabled and the keys are FP16.
    """

    def __init__(
        self,
        norm_factor: float,
        attention_dropout: float = 0.0,
        attention_dropout_ctx: Optional[Callable] = nullcontext,
        layer_number: Optional[int] = None,
    ) -> None:
        super().__init__()

        self.norm_factor = norm_factor
        self.attention_dropout_ctx = attention_dropout_ctx
        self.layer_number = layer_number

        self.scale_mask_softmax = FusedScaleMaskSoftmax(attention_mask_func)

        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout)

        # An FP16 training trick required for certain GPT-like models.
        self.apply_qk_layer_scaling = (
            bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and layer_number is not None)

    def forward(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attn_mask_type: str = "causal",
        attention_mask: Optional[torch.Tensor] = None,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """core attention fprop

        The inputs are indexed as [s, b, np, hn] (sequence, batch, heads,
        head size); the returned context is [sq, b, np * hn].

        Raises
        ------
        ValueError
            if `core_attention_bias_type` is not one of `no_bias`,
            `pre_scale_bias` or `post_scale_bias`.
        """

        assert (
            attn_mask_type in AttnMaskTypes
        ), f"attn_mask_type {attn_mask_type} not supported"

        batch_size, seqlen = query_layer.shape[1], query_layer.shape[0]
        apply_qk_layer_scaling = self.apply_qk_layer_scaling and key_layer.dtype == torch.float16

        # [b, np, sq, sk]
        output_size = (
            query_layer.size(1),
            query_layer.size(2),
            query_layer.size(0),
            key_layer.size(0),
        )

        assert key_layer.shape == value_layer.shape, "Keys and values must have the same shape!"
        if key_layer.shape[2] != query_layer.shape[2]:
            assert (query_layer.shape[2]%key_layer.shape[2]==0
                ),"The number of attention heads must be divisible by the number of GQA groups!"
            # Repeat every K/V head so the head count matches the queries.
            heads_per_group = query_layer.shape[2] // key_layer.shape[2]
            key_layer = key_layer.repeat_interleave(heads_per_group, dim = 2)
            value_layer = value_layer.repeat_interleave(heads_per_group, dim = 2)

        # [sq, b, np, hn] -> [sq, b * np, hn]
        query_layer = query_layer.reshape(
            output_size[2], output_size[0] * output_size[1], -1
        )
        # [sk, b, np, hn] -> [sk, b * np, hn]
        key_layer = key_layer.reshape(output_size[3], output_size[0] * output_size[1], -1)

        # preallocating result tensor: [b * np, sq, sk]
        # WAR to set dtype to FP32 as ONNX lacks BF16 support for ConstantOfShape operator
        is_bf16 = query_layer.dtype == torch.bfloat16
        matmul_result = torch.empty(
            output_size[0] * output_size[1],
            output_size[2],
            output_size[3],
            dtype=torch.float32 if is_in_onnx_export_mode() and is_bf16 else query_layer.dtype,
            device=torch.cuda.current_device(),
        )

        if is_in_onnx_export_mode() and is_bf16:
            matmul_result = matmul_result.bfloat16()

        scale = self.norm_factor
        if apply_qk_layer_scaling:
            scale *= self.layer_number

        # Raw attention scores. [b * np, sq, sk]
        if core_attention_bias_type == "no_bias":
            matmul_result = torch.baddbmm(
                matmul_result,
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
                beta=0.0,
                alpha=(1.0 / scale),
            )

        elif core_attention_bias_type == "pre_scale_bias":
            assert core_attention_bias is not None, "core_attention_bias should not be None!"
            # torch.Size takes a single iterable, not unpacked positional sizes.
            assert (core_attention_bias.shape == torch.Size([1, *output_size[1:]])
                    ), "core_attention_bias must be in [1, h, sq, skv] shape!"
            matmul_result = torch.bmm(
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
            )
            # The bias is added before the scores are divided by `scale`.
            matmul_result = (matmul_result.view(
                output_size[0], output_size[1], output_size[2], output_size[3])
                + core_attention_bias).view(-1, output_size[2], output_size[3])
            matmul_result /= scale

        elif core_attention_bias_type == "post_scale_bias":
            assert core_attention_bias is not None, "core_attention_bias should not be None!"
            assert (core_attention_bias.shape == torch.Size([1, *output_size[1:]])
                    ), "core_attention_bias must be in [1, h, sq, skv] shape!"
            matmul_result = torch.baddbmm(
                matmul_result,
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
                beta=0.0,
                alpha=(1.0 / scale),
            )
            # The bias is added after the scores have been scaled.
            matmul_result = (matmul_result.view(
                output_size[0], output_size[1], output_size[2], output_size[3])
                + core_attention_bias).view(-1, output_size[2], output_size[3])

        else:
            # Without this guard the uninitialised buffer allocated above
            # would be passed to the softmax as attention scores.
            raise ValueError(
                f"core_attention_bias_type {core_attention_bias_type} not supported")

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # attention scores and attention mask [b, np, sq, sk]
        softmax_scale = self.layer_number if apply_qk_layer_scaling else None
        attention_probs = self.scale_mask_softmax(
            attention_scores, attention_mask, attn_mask_type, softmax_scale)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with self.attention_dropout_ctx():
            attention_probs = self.attention_dropout(attention_probs)

        # value_layer -> context layer.
        # [sk, b, np, hn] --> [b, np, sq, hn]
        output_size = (
            value_layer.size(1),
            value_layer.size(2),
            query_layer.size(0),
            value_layer.size(3),
        )

        # change view [sk, b * np, hn]
        value_layer = value_layer.reshape(
            value_layer.size(0), output_size[0] * output_size[1], -1
        )

        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(
            output_size[0] * output_size[1], output_size[2], -1
        )

        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))

        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
        context_layer = context_layer.view(seqlen, batch_size, -1)

        return context_layer


class _PrepareQKVForFA(torch.autograd.Function):
    """This class converts QKV from interleaved (s, b, ...) layout
       to separate contiguous q, k, v tensors in (b, s, ...) layout.

       Both directions delegate the layout change to the `tex` extension
       (`fa_prepare_fwd` / `fa_prepare_bwd`)."""

    @staticmethod
    def forward(ctx,
                query_layer: torch.Tensor,
                key_layer: torch.Tensor,
                value_layer: torch.Tensor
    ) -> torch.Tensor:
        """Return the (query, key, value) tuple produced from the packed input."""
        # All inputs received are non-contiguous tensors.
        # Only `query_layer` is handed to the extension; it is used to
        # access the full memory region of the QKV tensor.
        packed = tex.fa_prepare_fwd(query_layer)
        q_part, k_part, v_part = split_tensor_along_dim(packed, 0, 3)
        # Drop the leading dimension left over from the split.
        return tuple(torch.squeeze(part, 0) for part in (q_part, k_part, v_part))

    @staticmethod
    def backward(ctx,
                 dq: torch.Tensor,
                 dk: torch.Tensor,
                 dv: torch.Tensor
    ) -> Tuple[Union[torch.Tensor, None], ...]:
        """Convert the three gradients back and split them along the last dim."""
        packed_grad = tex.fa_prepare_bwd(dq, dk, dv)
        grad_q, grad_k, grad_v = split_tensor_along_dim(packed_grad, -1, 3)
        return grad_q, grad_k, grad_v


395
def _check_qkv_layout(q, k, v):
396
397
    data_ptr = q.untyped_storage().data_ptr()
    check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [q, k, v])
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
    if not check_ptrs:
        return False

    stride = q.stride()
    check_strides = all(stride == x.stride() for x in [q, k, v])
    if not check_strides:
        return False

    shape = q.shape
    check_shapes = all(shape == x.shape for x in [q, k, v])
    if not check_shapes:
        return False

    last_dim_size = shape[-1]
    check_offsets = all(i * last_dim_size == x.storage_offset()
                        for i, x in enumerate([q, k, v]))
414
415
    if check_offsets:
        return "sbh3d"
416

417
418
419
420
421
422
423
424
425
    last_dims_size = shape[-1] * shape[-2]
    check_offsets = all(i * last_dims_size == x.storage_offset()
                        for i, x in enumerate([q, k, v]))
    if check_offsets:
        return "sb3hd"

    return "other"

def _check_kv_layout(k, v):
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
    data_ptr = k.untyped_storage().data_ptr()
    check_ptrs = all(x.untyped_storage().data_ptr() == data_ptr for x in [k, v])
    if not check_ptrs:
        return False

    stride = k.stride()
    check_strides = all(stride == x.stride() for x in [k, v])
    if not check_strides:
        return False

    shape = k.shape
    check_shapes = all(shape == x.shape for x in [k, v])
    if not check_shapes:
        return False

    last_dim_size = shape[-1]
    check_offsets = all(i * last_dim_size == x.storage_offset()
                        for i, x in enumerate([k, v]))
444
445
446
447
448
449
450
451
    if check_offsets:
        return "sbh2d"

    last_dims_size = shape[-1] * shape[-2]
    check_offsets = all(i * last_dims_size == x.storage_offset()
                        for i, x in enumerate([k, v]))
    if check_offsets:
        return "sb2hd"
452

453
    return "other"
454

455
456

class FlashAttention(torch.nn.Module):
    """Dot product attention, using HazyResearch flash-attn package:
    https://github.com/HazyResearch/flash-attention

    Parameters
    ----------
    norm_factor : float
        its reciprocal is passed to flash-attn as `softmax_scale`.
    attention_dropout : float, default = 0.0
        dropout probability; forced to 0.0 when the module is not training.
    attention_dropout_ctx : Optional[Callable], default = nullcontext
        zero-argument callable returning the context manager under which
        the flash-attn call is executed.
    deterministic : bool, default = False
        forwarded to flash-attn only when flash-attn 2 is not available.
    """

    def __init__(
        self,
        norm_factor: float,
        attention_dropout: float = 0.0,
        attention_dropout_ctx: Optional[Callable] = nullcontext,
        deterministic: bool = False,
    ) -> None:
        super().__init__()

        assert (
            _flash_attn_version >= _flash_attn_version_required
        ), f"FlashAttention minimum version {_flash_attn_version_required} is required."

        self.norm_factor = norm_factor
        self.attention_dropout_ctx = attention_dropout_ctx
        self.attention_dropout = attention_dropout
        self.deterministic = deterministic

    def forward(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attn_mask_type: str = "causal",
    ) -> torch.Tensor:
        """flash-attn fprop

        Inputs are indexed as [s, b, np, hn]; the output is [sq, b, np * hn].
        Causal masking is enabled only when `attn_mask_type == "causal"`.
        """

        assert (
            query_layer.dtype in [torch.float16, torch.bfloat16]
            and key_layer.dtype in [torch.float16, torch.bfloat16]
            and value_layer.dtype in [torch.float16, torch.bfloat16]
            ), 'FlashAttention currently only supports FP16 and BF16.'
        assert (
            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
            ), 'FlashAttention currently only supports CUDA tensors.'

        # For now just 128, will make it more general in the future

        if (query_layer.shape[-1] == 128 and
            query_layer.shape[0] * query_layer.shape[1] >= 512 and
            _check_qkv_layout(query_layer, key_layer, value_layer) == "sbh3d"):
            # Packed "sbh3d" input: use the dedicated conversion function.
            query_layer, key_layer, value_layer = _PrepareQKVForFA.apply(query_layer,
                                                                         key_layer,
                                                                         value_layer)
        else:
            # Generic path: [s, b, ...] -> contiguous [b, s, ...]
            query_layer, key_layer, value_layer = [x.transpose(0,1).contiguous()
                           for x in (query_layer, key_layer, value_layer)]

        batch_size, seqlen = query_layer.shape[0], query_layer.shape[1]

        # [b, sq, np, hn] -> [(b sq), np, hn]
        query_layer, key_layer, value_layer = [
            x.view(x.shape[0] * x.shape[1], *x.shape[2:])
            for x in [query_layer, key_layer, value_layer]
        ]

        # Every sequence has the same length, so the cumulative sequence
        # lengths are 0, seqlen, 2 * seqlen, ..., batch_size * seqlen.
        # The same values are used for both queries and keys/values.
        max_seqlen = seqlen
        cu_seqlens = torch.arange(
            0,
            (batch_size + 1) * seqlen,
            step=seqlen,
            dtype=torch.int32,
            device=query_layer.device)

        with self.attention_dropout_ctx():
            fa_optional_forward_kwargs = {}
            if not _flash_attn_2_available:
                fa_optional_forward_kwargs["deterministic"] = self.deterministic
            output = flash_attn_forward_func(
                query_layer, key_layer, value_layer, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen,
                self.attention_dropout if self.training else 0.0,
                softmax_scale=1.0/self.norm_factor, causal=attn_mask_type=="causal",
                **fa_optional_forward_kwargs
            )

        # [(b sq), np, hn] -> [sq, b, (np hn)]
        return output.view(batch_size, seqlen, -1).transpose(0, 1).contiguous()


540
541
542
543
544
545
class FusedAttnFunc_qkvpacked(torch.autograd.Function):
    """Function for FusedAttention with packed QKV input

    The forward pass calls `fused_attn_fwd_qkvpacked`. The backward pass
    calls either `flash_attn_cuda_bwd` (when `use_FAv2_bwd` is set) or
    `fused_attn_bwd_qkvpacked`.
    """

    @staticmethod
    def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias, attn_scale,
                dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type,
                rng_gen, fused_attention_backend, use_FAv2_bwd):
        """Run the fused forward kernel and stash what backward needs on `ctx`."""
        out, aux_ctx_tensors = fused_attn_fwd_qkvpacked(
            is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype,
            fused_attention_backend, attn_bias,
            None, None, None, None, None,
            attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type,
            rng_gen)

        ctx.save_for_backward(qkv, out, cu_seqlens)
        ctx.aux_ctx_tensors = aux_ctx_tensors
        ctx.max_seqlen = max_seqlen
        ctx.qkv_dtype = qkv_dtype
        ctx.attn_scale = attn_scale
        ctx.dropout_p = dropout_p
        ctx.fast_zero_fill = fast_zero_fill
        ctx.qkv_layout = qkv_layout
        ctx.attn_bias_type = attn_bias_type
        ctx.attn_mask_type = attn_mask_type
        ctx.fused_attention_backend = fused_attention_backend
        ctx.use_FAv2_bwd = use_FAv2_bwd

        return out

    @staticmethod
    def backward(ctx, d_out):
        """Return gradients positionally aligned with the `forward` arguments.

        Only `qkv` (position 3) and, when a bias is used, `attn_bias`
        (position 5) receive a gradient; every other entry is None.
        """
        qkv, out, cu_seqlens = ctx.saved_tensors

        def _maybe_contiguous(x):
            # Make the tensor contiguous unless its last dimension already
            # has unit stride.
            return x.contiguous() if x.stride(-1) != 1 else x

        if ctx.use_FAv2_bwd:
            softmax_lse, rng_state = ctx.aux_ctx_tensors
            dqkv = torch.empty_like(qkv)
            # q, k and v are slices of the packed tensor along dim 1.
            d_out, q, k, v, out = [_maybe_contiguous(x)
                for x in (d_out, qkv[:,0], qkv[:,1], qkv[:,2], out)]
            # The kernel is given slices of dqkv as its dq/dk/dv arguments.
            flash_attn_cuda_bwd(
                d_out, q, k, v, out, softmax_lse, dqkv[:,0], dqkv[:,1], dqkv[:,2],
                cu_seqlens, cu_seqlens, ctx.max_seqlen, ctx.max_seqlen,
                ctx.dropout_p, ctx.attn_scale, False,
                ctx.attn_mask_type == "causal", None, rng_state
            )
            dqkv = dqkv[..., :d_out.shape[-1]]
            # NOTE(review): `rest` is not bound on this path, so reaching the
            # bias return below with use_FAv2_bwd set raises NameError.
            # Presumably callers never combine the two - confirm.
        else:
            dqkv, *rest = fused_attn_bwd_qkvpacked(
                ctx.max_seqlen, cu_seqlens, qkv, out, d_out,
                ctx.qkv_dtype, ctx.aux_ctx_tensors,
                ctx.fused_attention_backend,
                None, None, None, None, None, None, None, None, None,
                ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill,
                ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type)

        # if no_bias, return dqkv
        if ctx.attn_bias_type == "no_bias":
            return (None, None, None, dqkv, None, None, None,
                    None, None, None, None, None, None,
                    None, None, None, None, None, None)
        # else, return (dqkv, dbias)
        return (None, None, None, dqkv, None, rest[0], None,
                None, None, None, None, None, None,
                None, None, None, None, None, None)

class FusedAttnFunc_kvpacked(torch.autograd.Function):
    """Function for FusedAttention with packed KV input

    The forward pass calls `fused_attn_fwd_kvpacked`. The backward pass
    calls either `flash_attn_cuda_bwd` (when `use_FAv2_bwd` is set) or
    `fused_attn_bwd_kvpacked`.
    """

    @staticmethod
    def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv,
                q, kv, qkv_dtype, attn_bias, attn_scale, dropout_p, fast_zero_fill,
                qkv_layout, attn_bias_type, attn_mask_type,
                rng_gen, fused_attention_backend, use_FAv2_bwd):
        """Run the fused forward kernel and stash what backward needs on `ctx`."""
        out, aux_ctx_tensors = fused_attn_fwd_kvpacked(
            is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv,
            q, kv, qkv_dtype, fused_attention_backend, attn_bias,
            None, None, None, None, None,
            attn_scale, dropout_p, fast_zero_fill, qkv_layout, attn_bias_type, attn_mask_type,
            rng_gen)

        ctx.save_for_backward(q, kv, out, cu_seqlens_q, cu_seqlens_kv)
        ctx.aux_ctx_tensors = aux_ctx_tensors
        ctx.max_seqlen_q = max_seqlen_q
        ctx.max_seqlen_kv = max_seqlen_kv
        ctx.qkv_dtype = qkv_dtype
        ctx.attn_scale = attn_scale
        ctx.dropout_p = dropout_p
        ctx.fast_zero_fill = fast_zero_fill
        ctx.qkv_layout = qkv_layout
        ctx.attn_bias_type = attn_bias_type
        ctx.attn_mask_type = attn_mask_type
        ctx.fused_attention_backend = fused_attention_backend
        ctx.use_FAv2_bwd = use_FAv2_bwd

        return out

    @staticmethod
    def backward(ctx, d_out):
        """Return gradients positionally aligned with the `forward` arguments.

        Only `q` (position 5), `kv` (position 6) and, when a bias is used,
        `attn_bias` (position 8) receive a gradient; every other entry is None.
        """
        q, kv, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors

        def _maybe_contiguous(x):
            # Make the tensor contiguous unless its last dimension already
            # has unit stride.
            return x.contiguous() if x.stride(-1) != 1 else x

        if ctx.use_FAv2_bwd:
            softmax_lse, rng_state = ctx.aux_ctx_tensors
            dq = torch.empty_like(q)
            dkv = torch.empty_like(kv)
            # k and v are slices of the packed tensor along dim 1.
            d_out, q, k, v, out = [_maybe_contiguous(x)
                for x in (d_out, q, kv[:,0], kv[:,1], out)]
            # The kernel is given dq and slices of dkv as its dq/dk/dv arguments.
            flash_attn_cuda_bwd(
                d_out, q, k, v, out, softmax_lse, dq, dkv[:,0], dkv[:,1],
                cu_seqlens_q, cu_seqlens_kv, ctx.max_seqlen_q, ctx.max_seqlen_kv,
                ctx.dropout_p, ctx.attn_scale, False,
                ctx.attn_mask_type == "causal", None, rng_state
            )
            dq = dq[..., :d_out.shape[-1]]
            dkv = dkv[..., :d_out.shape[-1]]
            # NOTE(review): `rest` is not bound on this path, so reaching the
            # bias return below with use_FAv2_bwd set raises NameError.
            # Presumably callers never combine the two - confirm.
        else:
            dq, dkv, *rest = fused_attn_bwd_kvpacked(
                ctx.max_seqlen_q, ctx.max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv,
                q, kv, out, d_out,
                ctx.qkv_dtype, ctx.aux_ctx_tensors,
                ctx.fused_attention_backend,
                None, None, None, None, None, None, None, None, None,
                ctx.attn_scale, ctx.dropout_p, ctx.fast_zero_fill,
                ctx.qkv_layout, ctx.attn_bias_type, ctx.attn_mask_type)

        # if no_bias, return (dq, dkv)
        if ctx.attn_bias_type == "no_bias":
            return (None, None, None, None, None, dq, dkv, None, None, None,
                    None, None, None, None, None, None,
                    None, None, None, None, None, None)
        # else, return (dq, dkv, dbias)
        return (None, None, None, None, None, dq, dkv, None, rest[0], None,
                None, None, None, None, None, None,
                None, None, None, None, None, None)

class FusedAttention(torch.nn.Module):
    """Dot product attention, with multiple backends:

    1. FusedAttnBackend["F16_max512_seqlen"]
       cuDNN based fused attention for FP16/BF16 and <=512 sequence length.
    2. FusedAttnBackend["F16_arbitrary_seqlen"]
       cuDNN based fused attention for FP16/BF16 and any sequence length.

    Support matrix:

    | backend       | 1                       | 2               |
    | flash based   | no                      | yes             |
    | cuDNN based   | yes                     | yes             |
    | qkv dtype     | fp16/bf16               | fp16/bf16       |
    | attn_type     | self/cross              | self            |
    | qkv_layout    |                         |                 |
    |  - qkv        | qkv_interleaved         | qkv_interleaved |
    |  - (q,kv)     | kv_interleaved          |                 |
    | mask_type     | causal/no_mask          | causal          |
    | bias_type     | no_bias/post_scale_bias | no_bias         |
    | dropout       | yes                     | yes             |
    | max_seqlen    | <=512                   | any             |
    | head_dim      | 64                      | 64,128          |
    | output dtype  | fp16/bf16               | fp16/bf16       |

    Parameters
    ----------
    norm_factor : float
                 the attention scores are scaled by `1.0 / norm_factor`.
    attention_dropout : float, default = 0.0
                       dropout probability; only applied while `self.training` is set.
    attention_dropout_ctx : Optional[Callable], default = `nullcontext`
                           factory for the context manager entered around the fused
                           attention call.
    attention_type : str, default = "self"
                    either "self" (packed qkv path) or "cross" (q plus packed kv path).
    """

    def __init__(
        self,
        norm_factor: float,
        attention_dropout: float = 0.0,
        attention_dropout_ctx: Optional[Callable] = nullcontext,
        attention_type: str = "self",
    ) -> None:
        super().__init__()

        self.norm_factor = norm_factor
        self.attention_dropout = attention_dropout
        self.attention_dropout_ctx = attention_dropout_ctx
        self.attention_type = attention_type
        # The FlashAttention v2 backward is opt-out through the environment and is only
        # considered when flash-attn 2 is installed and the device reports compute
        # capability 9.0.
        self.use_FAv2_bwd = (os.getenv("NVTE_FUSED_ATTN_USE_FAv2_BWD", "1") == "1"
                        and _flash_attn_2_available
                        and get_device_compute_capability() == 9.0)

    @staticmethod
    def _uniform_cu_seqlens(
        batch_size: int,
        seqlen: int,
        device: torch.device,
    ) -> torch.Tensor:
        """Return int32 offsets `[0, seqlen, ..., batch_size * seqlen]` on `device`."""
        return torch.arange(
            0,
            (batch_size + 1) * seqlen,
            step=seqlen,
            dtype=torch.int32,
            device=device)

    def forward(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attn_mask_type: str = "causal",
        fused_attention_backend:
            tex.NVTE_Fused_Attn_Backend = tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[torch.Tensor] = None,
        fast_zero_fill: bool = True,
    ) -> torch.Tensor:
        """fused attention fprop

        The inputs are indexed as `[seq, batch, ...]`: dimension 0 is read as the
        sequence length and dimension 1 as the batch size. Every sequence in the batch
        is treated as full length, so the cumulative sequence lengths are evenly spaced.

        Parameters
        ----------
        query_layer, key_layer, value_layer : torch.Tensor
            FP16/BF16 CUDA tensors.
        attn_mask_type : str, default = "causal"
            forwarded unchanged to the fused attention function.
        fused_attention_backend : tex.NVTE_Fused_Attn_Backend
            backend to run; must not be `NVTE_No_Backend`.
        core_attention_bias_type : str, default = "no_bias"
            forwarded unchanged to the fused attention function.
        core_attention_bias : Optional[torch.Tensor], default = `None`
            forwarded unchanged to the fused attention function.
        fast_zero_fill : bool, default = `True`
            forwarded unchanged to the fused attention function.

        Returns
        -------
        For `attention_type == "self"`, a tensor viewed as `[batch, seq_q, -1]` and
        transposed back to `[seq_q, batch, -1]`. For `attention_type == "cross"`, a
        2-tuple built the same way from the first two results of the kv-packed function.

        Raises
        ------
        AssertionError
            if no backend was selected, or the inputs are not FP16/BF16 CUDA tensors.
        Exception
            if the q/k/v (or k/v) memory layout is not one of the supported ones.
        ValueError
            if `self.attention_type` is neither "self" nor "cross".
        """

        assert (fused_attention_backend
                != tex.NVTE_Fused_Attn_Backend.NVTE_No_Backend
                ), 'No fused attention backend supports this input combination!'
        assert (
            (query_layer.dtype in [torch.float16, torch.bfloat16])
            and (key_layer.dtype in [torch.float16, torch.bfloat16])
            and (value_layer.dtype in [torch.float16, torch.bfloat16])
            ), 'FusedAttention only supports FP16 and BF16 data types.'
        assert (
            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
            ), 'FusedAttention only supports CUDA tensors.'

        qkv_dtype = TE_DType[query_layer.dtype]
        seqlen_q, batch_size = query_layer.shape[0], query_layer.shape[1]
        seqlen_kv = key_layer.shape[0]
        max_seqlen_q = seqlen_q
        max_seqlen_kv = seqlen_kv

        # Computed up front because both the self- and the cross-attention call below
        # pass it on; previously it was only bound inside the self-attention branch.
        use_FAv2_bwd = (self.use_FAv2_bwd
                    and (fused_attention_backend
                        == tex.NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen)
                    and core_attention_bias_type == "no_bias")

        if self.attention_type == "self":
            qkv_layout = _check_qkv_layout(query_layer, key_layer, value_layer)
            if qkv_layout == "sbh3d":
                mixed_layer = _CombineQKV.apply(query_layer, key_layer, value_layer, 3)
                # [s, b, h, 3, d]
                mixed_layer = mixed_layer.view(
                        *mixed_layer.shape[0:3], 3, query_layer.shape[-1])
                # [b, s, 3, h, d]
                mixed_layer = mixed_layer.transpose(2, 3).transpose(0, 1).contiguous()
            elif qkv_layout == "sb3hd":
                mixed_layer = _CombineQKV.apply(query_layer, key_layer, value_layer, 2)
                # [s, b, 3, h, d]
                mixed_layer = mixed_layer.view(
                        *mixed_layer.shape[0:2], 3, *query_layer.shape[2:])
                # [b, s, 3, h, d]
                mixed_layer = mixed_layer.transpose(0, 1).contiguous()
            else:
                raise Exception("FusedAttention only supports qkv layout sbh3d or sb3hd!")

            # [total_seqs, 3, h, d]
            mixed_layer = mixed_layer.view(
                mixed_layer.shape[0] * mixed_layer.shape[1], *mixed_layer.shape[2:])

            qkv_layout = "qkv_interleaved"
            max_seqlen = seqlen_q
            cu_seqlens = self._uniform_cu_seqlens(batch_size, seqlen_q, query_layer.device)

            with self.attention_dropout_ctx():
                output = FusedAttnFunc_qkvpacked.apply(
                    self.training,
                    max_seqlen,
                    cu_seqlens,
                    mixed_layer,
                    qkv_dtype,
                    core_attention_bias,
                    1.0/self.norm_factor,
                    self.attention_dropout if self.training else 0.0,
                    fast_zero_fill,
                    qkv_layout,
                    core_attention_bias_type,
                    attn_mask_type,
                    None, # rng_gen
                    fused_attention_backend,
                    use_FAv2_bwd
                )
            output = output.view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous()

        elif self.attention_type == "cross":
            kv_layout = _check_kv_layout(key_layer, value_layer)
            if kv_layout == "sbh2d":
                key_value = _CombineKV.apply(key_layer, value_layer, 3)
                # [s, b, h, 2, d]
                key_value = key_value.view(
                        *key_value.shape[0:3], 2, key_layer.shape[-1])
                # [b, s, 2, h, d]
                key_value = key_value.transpose(2, 3).transpose(0, 1).contiguous()
            elif kv_layout == "sb2hd":
                key_value = _CombineKV.apply(key_layer, value_layer, 2)
                # [s, b, 2, h, d]
                key_value = key_value.view(
                        *key_value.shape[0:2], 2, *key_layer.shape[2:])
                # [b, s, 2, h, d]
                key_value = key_value.transpose(0, 1).contiguous()
            else:
                raise Exception("FusedAttention only supports kv layout sbh2d or sb2hd!")

            # [total_seqs, h, d]
            query_layer = query_layer.transpose(0, 1).contiguous()
            query_layer = query_layer.view(
                    query_layer.shape[0] * query_layer.shape[1], *query_layer.shape[2:])
            # [total_seqs, 2, h, d]
            # Unpack the trailing dims: a list cannot be concatenated with torch.Size.
            key_value = key_value.view(
                    key_value.shape[0] * key_value.shape[1], *key_value.shape[2:])

            qkv_layout = "kv_interleaved"
            cu_seqlens_q = self._uniform_cu_seqlens(batch_size, seqlen_q, query_layer.device)
            cu_seqlens_kv = self._uniform_cu_seqlens(batch_size, seqlen_kv, key_layer.device)

            with self.attention_dropout_ctx():
                outputs = FusedAttnFunc_kvpacked.apply(
                    self.training,
                    max_seqlen_q, max_seqlen_kv,
                    cu_seqlens_q, cu_seqlens_kv,
                    query_layer, key_value,
                    qkv_dtype,
                    core_attention_bias,
                    1.0/self.norm_factor,
                    self.attention_dropout if self.training else 0.0,
                    fast_zero_fill,
                    qkv_layout,
                    core_attention_bias_type,
                    attn_mask_type,
                    None, # rng_gen
                    fused_attention_backend,
                    use_FAv2_bwd
                )

            # NOTE(review): this assumes FusedAttnFunc_kvpacked returns at least two
            # tensors that can each be viewed as [b, s_q, -1] -- confirm against its
            # forward().
            output = (outputs[0].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous(),
                    outputs[1].view(batch_size, seqlen_q, -1).transpose(0, 1).contiguous())

        else:
            raise ValueError(
                f"FusedAttention does not support attention_type={self.attention_type!r}; "
                "expected 'self' or 'cross'.")

        return output


869
870
871
872
873
874
875
876
877
878
879
880
class DotProductAttention(torch.nn.Module):
    """Allows the model to jointly attend to information from different
    representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    .. note::

        Argument :attr:`attention_mask` will be ignored in the `forward` call when
        :attr:`attn_mask_type` is set to `"causal"`.

    .. warning::

        FlashAttention uses a non-deterministic algorithm for optimal performance. To observe
        deterministic behavior at the cost of performance, use FlashAttention version < `2.0.0`
        and set the environment variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order
        to disable `flash-attn` entirely, set :attr:`NVTE_FLASH_ATTN=0`.

    .. warning::

        Argument :attr:`attn_mask_type` has been moved to the `forward` method and
        is deprecated. It will be fully removed in future releases.

    Parameters
    ----------
    num_attention_heads : int
                         number of attention heads in the transformer layer.
    kv_channels : int
                number of key-value channels.
    num_gqa_groups : Optional[int] = None
                    number of GQA groups in the transformer layer.
                    Grouped Query Attention is described in
                    `this paper <https://arxiv.org/pdf/2305.13245.pdf>`_.
                    This only affects the keys and values, not the queries.
                    GQA-1 is equivalent to Multi-Query Attention
                    (`MQA <https://arxiv.org/pdf/1911.02150.pdf>`_), while GQA-H
                    is equivalent to MHA, i.e. `num_gqa_groups = num_attention_heads`.
    attention_dropout: float, default = 0.0
                      dropout probability for the dropout op during multi-head attention.
    attn_mask_type: Optional[str], default = `None`
                   deprecated; when set, it overrides the `attn_mask_type` given to
                   `forward`.
    layer_number: int, default = `None`
                 layer number of the current `DotProductAttention` when multiple such modules
                 are concatenated, for instance in consecutive transformer blocks.
    attention_type: str, default = `self`
                   selects the `qkv_interleaved` layout for `self` and the
                   `kv_interleaved` layout otherwise when querying the fused backend.

    Parallelism parameters
    ----------------------
    sequence_parallel : bool, default = `False`
                       if set to `True`, uses sequence parallelism.
    tp_size : int, default = 1
             tensor parallel world size. Ignored when `tp_group` is given, in which
             case the size of the group is used.
    tp_group : ProcessGroup, default = `None`
              tensor parallel process group.
    get_rng_state_tracker : Callable, default = `None`
                           when given and `sequence_parallel` is `False`, the `fork`
                           method of the returned tracker is used as the dropout context.
    """

    def __init__(
        self,
        num_attention_heads: int,
        kv_channels: int,
        num_gqa_groups: Optional[int] = None,
        attention_dropout: float = 0.0,
        attn_mask_type: Optional[str] = None,
        sequence_parallel: bool = False,
        tp_size: int = 1,
        get_rng_state_tracker: Optional[Callable] = None,
        tp_group: Optional[dist_group_type] = None,
        layer_number: Optional[int] = None,
        attention_type: str = "self",
    ) -> None:
        super().__init__()

        if attn_mask_type is not None:
            warnings.warn(
                "Argument :attr:`attn_mask_type` has been moved to the `forward` method and "
                "is deprecated. It will be fully removed in future releases.",
                category=DeprecationWarning,
            )

        self.attn_mask_type = attn_mask_type
        self.tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group)
        self.tp_group = tp_group
        self.get_rng_state_tracker = get_rng_state_tracker
        self.num_attention_heads = num_attention_heads

        self.hidden_size_per_attention_head = kv_channels
        self.num_gqa_groups = (
            num_attention_heads if num_gqa_groups is None else num_gqa_groups
        )
        # Divide by the resolved world size so that a `tp_group` passed without a
        # matching `tp_size` still yields the correct per-partition group count.
        self.num_gqa_groups_per_partition = int(self.num_gqa_groups // self.tp_size)

        assert (num_attention_heads % self.num_gqa_groups == 0
                ), "The number of attention heads must be divisible by the number of GQA groups!"

        if sequence_parallel or get_rng_state_tracker is None:
            attention_dropout_ctx = nullcontext
        else:
            attention_dropout_ctx = get_rng_state_tracker().fork

        norm_factor = math.sqrt(self.hidden_size_per_attention_head)

        self.device_compute_capability = get_device_compute_capability()
        self.deterministic = not bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")))

        self.use_flash_attention = (
            int(os.getenv("NVTE_FLASH_ATTN", "1"))
            and self.device_compute_capability >= 8.0
        )
        if _flash_attn_2_available and self.deterministic:
            self.use_flash_attention = False
            warnings.warn(
                "Disabling usage of FlashAttention since version 2 does not support deterministic "
                "execution. In order to use FA with deterministic behavior, please install "
                "FlashAttention version 1."
            )

        self.use_fused_attention = (
            int(os.getenv("NVTE_FUSED_ATTN", "1"))
            and self.device_compute_capability >= 8.0
        )

        attn_kwargs = {
            "attention_dropout": attention_dropout,
            "attention_dropout_ctx": attention_dropout_ctx,
        }
        self.attention_type = attention_type
        self.attention_dropout = attention_dropout

        if self.use_flash_attention:
            self.flash_attention = FlashAttention(
                norm_factor, **attn_kwargs,
                deterministic=self.deterministic)
        # Instantiating three types since use of flash-attn and FusedAttention
        # might be ruled out due to forward inputs.
        if self.use_fused_attention:
            self.fused_attention = FusedAttention(
                norm_factor, **attn_kwargs,
                attention_type=attention_type)
        self.unfused_attention = UnfusedDotProductAttention(
            norm_factor, **attn_kwargs, layer_number=layer_number)

    def _checkpointed_attention_forward(
        self,
        attention_func: Callable,
        *forward_args: Tuple[torch.Tensor, ...],
        **forward_kwargs: Dict[str, Any],
    ) -> torch.Tensor:
        """Forward method with activation checkpointing.

        Runs `attention_func(*forward_args, **forward_kwargs)` through `checkpoint`,
        handing it this module's RNG state tracker getter and tensor parallel group.
        """

        def custom_forward(*input_args, **input_kwargs):
            return attention_func(*input_args, **input_kwargs)

        hidden_states = checkpoint(
            custom_forward,
            False,
            self.get_rng_state_tracker,
            self.tp_group,
            *forward_args,
            **forward_kwargs,
        )

        return hidden_states

    def forward(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        attn_mask_type: str = "causal",
        checkpoint_core_attention: bool = False,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[torch.Tensor] = None,
        fast_zero_fill: bool = True,
    ) -> torch.Tensor:
        """
        Dot Product Attention Layer.

        .. note::

            Argument :attr:`attention_mask` will be ignored when :attr:`attn_mask_type`
            is set to `"causal"`.

        .. note::

            Input tensors :attr:`query_layer`, :attr:`key_layer`, and :attr:`value_layer`
            must each be of shape (:attr:`sequence_length`, :attr:`batch_size`,
            :attr:`num_attention_heads`, :attr:`kv_channels`). Output of shape
            (:attr:`sequence_length`, :attr:`batch_size`, :attr:`num_attention_heads`
            * :attr:`kv_channels`) is returned.

        .. note::

            `DotProductAttention` supports three backends: 1) `FlashAttention` which calls
            HazyResearch's FlashAttention PyTorch API, 2) `FusedAttention` which has multiple
            fused attention implementations as its backends (see `FusedAttention` for
            more details), and 3) `UnfusedDotProductAttention` which is the native PyTorch
            implementation with fused scaled masked softmax. Users can use environment variables
            `NVTE_FLASH_ATTN`, `NVTE_FUSED_ATTN`, and `NVTE_FUSED_ATTN_BACKEND` to control
            which DotProductAttention backend, and FusedAttention backend if applicable, to use.
            The default DotProductAttention backend is 1.

        Parameters
        ----------
        query_layer : torch.Tensor
                     Query tensor.
        key_layer : torch.Tensor
                   Key tensor.
        value_layer : torch.Tensor
                     Value tensor.
        attention_mask : Optional[torch.Tensor], default = `None`
                        Boolean tensor used to mask out softmax input when not using flash-attn.
        attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal`
                       type of attention mask passed into softmax operation.
        checkpoint_core_attention : bool, default = `False`
                                   If true, forward activations for attention are recomputed
                                   during the backward pass in order to save memory that would
                                   otherwise be occupied to store the forward activations until
                                   backprop.
        core_attention_bias_type: str, default = `no_bias`
                    Bias type, {`no_bias`, `pre_scale_bias`, `post_scale_bias`}
        core_attention_bias: Optional[torch.Tensor], default = `None`
                    Bias tensor for Q * K.T
        fast_zero_fill: bool, default = `True`
                    Whether to use the fast path to set output tensors to 0 or not.
        """

        if self.attn_mask_type is not None:
            warnings.warn(
                "Argument :attr:`attn_mask_type` has been moved to the `forward` method and "
                "is deprecated. It will be fully removed in future releases.",
                category=DeprecationWarning,
            )
            # Keep previous functionality for current users.
            attn_mask_type = self.attn_mask_type

        # The check is against the per-partition count, so report that same number.
        assert (key_layer.shape[-2] == self.num_gqa_groups_per_partition
                and value_layer.shape[-2] == self.num_gqa_groups_per_partition
                ), f"Keys and values must have {self.num_gqa_groups_per_partition} heads!"

        # Start from the module-level switches, then rule backends out per call.
        use_flash_attention = self.use_flash_attention
        use_fused_attention = self.use_fused_attention

        if (query_layer.dtype not in [torch.bfloat16, torch.float16]
            or key_layer.dtype not in [torch.bfloat16, torch.float16]
            or value_layer.dtype not in [torch.bfloat16, torch.float16]
        ):
            use_flash_attention = False

        if key_layer.shape[-1] > 64:
            if self.device_compute_capability in (8.6, 8.7):
                use_flash_attention = False
            elif not _flash_attn_2_available and self.device_compute_capability == 8.9:
                use_flash_attention = False

        if not _flash_attn_2_available and self.num_gqa_groups != self.num_attention_heads:
            use_flash_attention = False

        if attn_mask_type == "padding" and attention_mask is not None:
            use_flash_attention = False
            use_fused_attention = False

        if core_attention_bias_type != "no_bias" or core_attention_bias is not None:
            use_flash_attention = False

        if is_in_onnx_export_mode():
            use_flash_attention = False
            use_fused_attention = False

        qkv_layout = "qkv_interleaved" if self.attention_type == "self" else "kv_interleaved"

        if use_fused_attention:
            fused_attention_backend = tex.get_fused_attn_backend(
                TE_DType[query_layer.dtype],
                TE_DType[key_layer.dtype],
                QKVLayout[qkv_layout],
                AttnBiasType[core_attention_bias_type],
                AttnMaskType[attn_mask_type],
                self.attention_dropout,
                query_layer.shape[0], key_layer.shape[0],
                query_layer.shape[-1])
            # DPA does not support FP8; for FP8, use cpp_extensions modules directly
            is_backend_avail = (fused_attention_backend in
                [FusedAttnBackend["F16_max512_seqlen"], FusedAttnBackend["F16_arbitrary_seqlen"]])
            use_fused_attention = (use_fused_attention
                                  and is_backend_avail
                                  and self.num_gqa_groups == self.num_attention_heads)
            if (self.deterministic
                and fused_attention_backend == FusedAttnBackend["F16_arbitrary_seqlen"]):
                use_fused_attention = False
                warnings.warn(
                    "Disabling usage of FusedAttention since the FusedAttention "
                    "backend does not support deterministic execution."
                )

        if use_flash_attention:
            if checkpoint_core_attention:
                return self._checkpointed_attention_forward(self.flash_attention,
                                                            query_layer,
                                                            key_layer,
                                                            value_layer,
                                                            attn_mask_type=attn_mask_type)
            return self.flash_attention(
                query_layer, key_layer, value_layer, attn_mask_type=attn_mask_type)

        if use_fused_attention:
            if checkpoint_core_attention:
                return self._checkpointed_attention_forward(self.fused_attention,
                              query_layer,
                              key_layer,
                              value_layer,
                              attn_mask_type=attn_mask_type,
                              fused_attention_backend=fused_attention_backend,
                              core_attention_bias_type=core_attention_bias_type,
                              core_attention_bias=core_attention_bias,
                              fast_zero_fill=fast_zero_fill)
            return self.fused_attention(query_layer, key_layer, value_layer,
                              attn_mask_type=attn_mask_type,
                              fused_attention_backend=fused_attention_backend,
                              core_attention_bias_type=core_attention_bias_type,
                              core_attention_bias=core_attention_bias,
                              fast_zero_fill=fast_zero_fill)

        # Neither accelerated backend applies: fall back to the unfused implementation.
        if checkpoint_core_attention:
            return self._checkpointed_attention_forward(
                self.unfused_attention,
                query_layer,
                key_layer,
                value_layer,
                attn_mask_type=attn_mask_type,
                attention_mask=attention_mask,
                core_attention_bias_type=core_attention_bias_type,
                core_attention_bias=core_attention_bias,
            )
        return self.unfused_attention(query_layer,
                key_layer,
                value_layer,
                attn_mask_type=attn_mask_type,
                attention_mask=attention_mask,
                core_attention_bias_type=core_attention_bias_type,
                core_attention_bias=core_attention_bias,
        )
1207
1208


1209
1210
1211
1212
1213
1214
1215
1216
class MultiheadAttention(torch.nn.Module):
    r"""
    Multi-head Attention (MHA), including Query,
    Key, Value and Output projection.

    .. note::

        Argument :attr:`attention_mask` will be ignored in the `forward` call when
        :attr:`attn_mask_type` is set to `"causal"`.

    .. warning::

        Argument :attr:`attn_mask_type` has been moved to the `forward` method and
        is deprecated. It will be fully removed in future releases.

    Parameters
    ----------
    hidden_size : int
                 size of each input sample.
    num_attention_heads : int
                         number of attention heads in the transformer layer.
    kv_channels: int, default = `None`
                number of key-value channels. defaults to
                :attr:`hidden_size` / :attr:`num_attention_heads` if `None`.
    attention_dropout: float, default = 0.1
                      dropout probability for the dropout op during multi-head attention.
    layernorm_epsilon : float, default = 1e-5
                       a value added to the denominator of layer normalization
                       for numerical stability.
    init_method : Callable, default = `None`
                 used for initializing weights of QKV and FC1 weights in the following way:
                 `init_method(weight)`. When set to `None`, defaults to
                 `torch.nn.init.normal_(mean=0.0, std=0.023)`.
    output_layer_init_method : Callable, default = `None`
                              used for initializing weights of PROJ and FC2 in the following way:
                              `output_layer_init_method(weight)`. When set to `None`, defaults to
                              `torch.nn.init.normal_(mean=0.0, std=0.023)`.
    layer_number: int, default = `None`
                 layer number of the current `TransformerLayer` when multiple such modules are
                 concatenated to form a transformer block. Must be positive when given.
    attn_mask_type: str, default = `None`
                   deprecated; pass :attr:`attn_mask_type` to `forward` instead. When set
                   here, it overrides the value given to `forward`.
    num_gqa_groups : int, default = `None`
                         number of GQA groups in the transformer layer.
                         Grouped Query Attention is described in
                         `this paper <https://arxiv.org/pdf/2305.13245.pdf>`_.
                         This only affects the keys and values, not the queries.
                         GQA-1 is equivalent to Multi-Query Attention
                         (`MQA <https://arxiv.org/pdf/1911.02150.pdf>`_), while GQA-H
                         is equivalent to MHA, i.e. `num_gqa_groups = num_attention_heads`.
                         When `None`, :attr:`num_attention_heads` is used.
    return_layernorm_output : bool, default = `False`
                             if set to `True`, output of layernorm is returned from the forward
                             together with the output of the linear transformation.
                             Example use case: residual connection for transformer module is
                             taken post layernorm.
    input_layernorm: bool, default = `False`
                     if set to `False`, layer normalization to the input is not applied.
    attention_type: { 'self', 'cross' }, default = 'self'
                   type of attention applied.
    zero_centered_gamma : bool, default = `False`
                         if set to `True`, gamma parameter in LayerNorm is initialized to 0 and
                         the LayerNorm formula changes to

                         .. math::
                            y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
                            (1 + \gamma) + \beta
    normalization : { 'LayerNorm', 'RMSNorm' }, default = 'LayerNorm'
                   type of normalization applied.
    qkv_weight_interleaved : bool, default = `True`
                            if set to `False`, the QKV weight is interpreted as a concatenation of
                            query, key, and value weights along the `0th` dimension. The default
                            interpretation is that the individual `q`, `k`, and `v` weights for each
                            attention head are interleaved. This parameter is set to `False` when
                            using :attr:`fuse_qkv_params=False`.
    bias : bool, default = `True`
          if set to `False`, the transformer layer will not learn any additive biases.
    device : Union[torch.device, str], default = "cuda"
          The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.

    Parallelism parameters
    ----------------------
    set_parallel_mode : bool, default = `False`
                      if set to `True`, QKV and FC1 layers are used as Column Parallel
                      whereas PROJ and FC2 is used as Row Parallel as described
                      `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
    sequence_parallel : bool, default = `False`
                       if set to `True`, uses sequence parallelism.
    tp_group : ProcessGroup, default = `None`
              tensor parallel process group.
    tp_size : int, default = 1
             used as TP (tensor parallel) world size when TP groups are not formed during
             initialization. In this case, users must call the
             `set_tensor_parallel_group(tp_group)` method on the initialized module before the
             forward pass to supply the tensor parallel group needed for tensor and sequence
             parallel collectives.
    get_rng_state_tracker : Callable, default = `None`
                           forwarded unchanged to the projection layers and to the core
                           attention module.

    Optimization parameters
    -----------------------
    fuse_wgrad_accumulation : bool, default = `False`
                             if set to `True`, enables fusing of creation and accumulation of
                             the weight gradient. When enabled, it is assumed that the weights
                             have an additional `main_grad` attribute (used instead of the
                             regular `grad`) which is a pre-allocated buffer of the correct
                             size to accumulate gradients in.
    params_dtype : torch.dtype, default = `torch.get_default_dtype()`
                  it controls the type used to allocate the initial parameters. Useful when
                  the model is trained with lower precision and the original FP32 parameters
                  would not fit in GPU memory.
    return_bias : bool, default = `False`
                 when set to `True`, this module will not apply the additive bias itself, but
                 instead return the bias value during the forward pass together with the
                 output of the linear transformation :math:`y = xA^T`. This is useful when
                 the bias addition can be fused to subsequent operations.
    fuse_qkv_params: bool, default = `False`
                    if set to `True`, `TransformerLayer` module exposes a single fused
                    parameter for query-key-value. This enables optimizations such as QKV
                    fusion without concatenations/splits and also enables the argument
                    `fuse_wgrad_accumulation`.
    ub_bulk_wgrad, ub_bulk_dgrad, ub_split_rs, ub_split_ag : bool, default = `False`
                    forwarded unchanged to the underlying `LayerNormLinear` / `Linear`
                    layers; their semantics are defined by those modules.
    """

    def __init__(
        self,
        hidden_size: int,
        num_attention_heads: int,
        kv_channels: Optional[int] = None,
        attention_dropout: float = 0.1,
        layernorm_epsilon: float = 1e-5,
        init_method: Optional[Callable] = None,
        output_layer_init_method: Optional[Callable] = None,
        layer_number: Optional[int] = None,
        attn_mask_type: Optional[str] = None,
        tp_group: Optional[dist_group_type] = None,
        tp_size: int = 1,
        num_gqa_groups: Optional[int] = None,
        fuse_wgrad_accumulation: bool = False,
        get_rng_state_tracker: Optional[Callable] = None,
        sequence_parallel: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        return_bias: bool = False,
        return_layernorm_output: bool = False,
        input_layernorm: bool = False,
        attention_type: str = "self",
        set_parallel_mode: bool = False,
        fuse_qkv_params: bool = False,
        zero_centered_gamma: bool = False,
        qkv_weight_interleaved: bool = True,
        ub_bulk_wgrad: bool = False,
        ub_bulk_dgrad: bool = False,
        ub_split_rs: bool = False,
        ub_split_ag: bool = False,
        bias: bool = True,
        normalization: str = "LayerNorm",
        device: Union[torch.device, str] = "cuda",
    ) -> None:
        """Build the Q/K/V projections, the core attention and the output projection.

        Arguments are described in the class docstring. A fused QKV projection is
        created for self attention without GQA; otherwise a separate query
        projection and a joint key/value projection are created.
        """
        super().__init__()

        if attn_mask_type is not None:
            warnings.warn(
                "Argument :attr:`attn_mask_type` has been moved to the `forward` method and "
                "is deprecated. It will be fully removed in future releases.",
                category=DeprecationWarning,
            )

        self.attn_mask_type = attn_mask_type
        self.layer_number = layer_number
        self.input_layernorm = input_layernorm
        self.attention_type = attention_type
        self.get_rng_state_tracker = get_rng_state_tracker
        self.tp_group = tp_group
        self.return_layernorm_output = return_layernorm_output
        self.params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
        self.num_attention_heads = num_attention_heads
        self.return_bias = return_bias

        kv_channels = kv_channels if kv_channels else (hidden_size // num_attention_heads)

        if init_method is None:
            init_method = get_default_init_method()
        if output_layer_init_method is None:
            output_layer_init_method = get_default_init_method()

        # Without a fused QKV parameter the weights are always laid out as a plain
        # concatenation, so the interleaved interpretation is switched off.
        if not fuse_qkv_params:
            qkv_weight_interleaved = False
        self.qkv_weight_interleaved = qkv_weight_interleaved

        assert attention_type in AttnTypes, f"attention_type {attention_type} not supported"
        if layer_number is not None:
            assert layer_number > 0, "layer_number must be a positive integer"

        # An explicit process group takes precedence over the `tp_size` argument.
        tp_size = tp_size if tp_group is None else get_distributed_world_size(tp_group)
        self.tp_size = tp_size
        self.sequence_parallel = (tp_size > 1) and sequence_parallel

        self.hidden_size_per_attention_head = kv_channels
        self.num_attention_heads_per_partition = divide(num_attention_heads, tp_size)
        self.num_gqa_groups = (
            num_attention_heads if num_gqa_groups is None else num_gqa_groups
        )
        assert (num_attention_heads % self.num_gqa_groups == 0
                ), "The number of attention heads must be divisible by the number of GQA groups!"
        assert (num_attention_heads % tp_size == 0
                ), "The number of attention heads must be divisible by tensor parallel size!"
        # The per-partition group count below is a floor division; reject sizes that
        # would silently drop groups (e.g. MQA with tp_size > 1).
        assert (self.num_gqa_groups % tp_size == 0
                ), "The number of GQA groups must be divisible by tensor parallel size!"
        self.num_gqa_groups_per_partition = int(self.num_gqa_groups // tp_size)
        self.hidden_size_kv = int(hidden_size * self.num_gqa_groups // num_attention_heads)

        # Keyword arguments shared by every GEMM-based sub-layer.
        common_gemm_kwargs = {
            "fuse_wgrad_accumulation": fuse_wgrad_accumulation,
            "tp_group": tp_group,
            "tp_size": tp_size,
            "get_rng_state_tracker": get_rng_state_tracker,
            "sequence_parallel": sequence_parallel,
            "params_dtype": self.params_dtype,
            "device": device,
        }

        qkv_parallel_mode = "column" if set_parallel_mode else None

        if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads:
            # Plain self attention: a single projection produces Q, K and V.
            if self.input_layernorm:
                self.layernorm_qkv = LayerNormLinear(
                    hidden_size,
                    3 * hidden_size,
                    eps=layernorm_epsilon,
                    init_method=init_method,
                    bias=bias,
                    return_bias=False,
                    parallel_mode=qkv_parallel_mode,
                    return_layernorm_output=return_layernorm_output,
                    parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None,
                    zero_centered_gamma=zero_centered_gamma,
                    ub_bulk_wgrad=ub_bulk_wgrad,
                    ub_bulk_dgrad=ub_bulk_dgrad,
                    ub_split_ag=ub_split_ag,
                    normalization=normalization,
                    **common_gemm_kwargs,
                )
            else:
                self.qkv = Linear(
                    hidden_size,
                    3 * hidden_size,
                    init_method=init_method,
                    bias=bias,
                    return_bias=False,
                    parallel_mode=qkv_parallel_mode,
                    parameters_split=("query_", "key_", "value_") if not fuse_qkv_params else None,
                    **common_gemm_kwargs,
                )
        elif ((self.attention_type == "cross")
                or (self.attention_type == "self"
                    and self.num_gqa_groups != self.num_attention_heads)):
            # Cross attention or GQA: the query is projected separately from the
            # (possibly narrower) joint key/value projection.
            if self.input_layernorm:
                self.layernorm_query = LayerNormLinear(
                    hidden_size,
                    hidden_size,
                    eps=layernorm_epsilon,
                    init_method=init_method,
                    bias=bias,
                    return_bias=False,
                    parallel_mode=qkv_parallel_mode,
                    return_layernorm_output=return_layernorm_output,
                    zero_centered_gamma=zero_centered_gamma,
                    ub_bulk_wgrad=ub_bulk_wgrad,
                    ub_bulk_dgrad=ub_bulk_dgrad,
                    ub_split_ag=ub_split_ag,
                    normalization=normalization,
                    **common_gemm_kwargs,
                )
            else:
                self.query_layer = Linear(
                    hidden_size,
                    hidden_size,
                    init_method=init_method,
                    bias=bias,
                    return_bias=False,
                    parallel_mode=qkv_parallel_mode,
                    **common_gemm_kwargs,
                )
            self.key_value = Linear(
                hidden_size,
                2 * self.hidden_size_kv,
                init_method=init_method,
                bias=bias,
                return_bias=False,
                parallel_mode=qkv_parallel_mode,
                parameters_split=("key_", "value_") if not fuse_qkv_params else None,
                **common_gemm_kwargs,
            )

        # Attention.
        self.core_attention = DotProductAttention(
            num_attention_heads,
            kv_channels,
            num_gqa_groups=self.num_gqa_groups,
            attention_dropout=attention_dropout,
            tp_size=tp_size,
            get_rng_state_tracker=get_rng_state_tracker,
            sequence_parallel=sequence_parallel,
            tp_group=tp_group,
            layer_number=self.layer_number,
        )

        # Linear
        self.proj = Linear(
            hidden_size,
            hidden_size,
            init_method=output_layer_init_method,
            bias=bias,
            return_bias=return_bias,
            parallel_mode="row" if set_parallel_mode else None,
            ub_split_rs=ub_split_rs,
            ub_split_ag=ub_split_ag,
            **common_gemm_kwargs,
        )


    def _allocate_memory(
        self, inference_max_sequence_len: int, batch_size: int, dtype: torch.dtype
    ) -> torch.Tensor:
        """Return an uninitialized cache buffer on the current CUDA device.

        The buffer has shape ``(inference_max_sequence_len, batch_size,
        num_gqa_groups_per_partition, hidden_size_per_attention_head)`` and
        the requested ``dtype``. Its contents are not zeroed.
        """
        cache_shape = (
            inference_max_sequence_len,
            batch_size,
            self.num_gqa_groups_per_partition,
            self.hidden_size_per_attention_head,
        )
        return torch.empty(cache_shape, dtype=dtype, device=torch.cuda.current_device())

    def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> None:
        """Set TP group"""
        self.tp_group = tp_group

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_output: Optional[torch.Tensor] = None,
        attn_mask_type: str = "causal",
        is_first_microbatch: Optional[bool] = None,
        checkpoint_core_attention: bool = False,
        inference_params: Optional[Any] = None,
        rotary_pos_emb: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[torch.Tensor] = None,
        fast_zero_fill: bool = True,
    ) -> Tuple[Union[torch.Tensor, None], ...]:
        """
        Forward propagation for MultiheadAttention layer.

        .. note::

            Argument :attr:`attention_mask` will be ignored when :attr:`attn_mask_type`
            is set to `"causal"`.

        Parameters
        ----------
        hidden_states : torch.Tensor
             Input tensor.
        attention_mask : Optional[torch.Tensor], default = `None`
             Boolean tensor used to mask out self-attention softmax input.
        attn_mask_type: {'causal', 'padding', 'no_mask'}, default = `causal`
                       type of attention mask passed into softmax operation.
                       Overridden by the deprecated constructor argument of the
                       same name when that was set.
        encoder_output : Optional[torch.Tensor], default = `None`
             Output of the encoder block to be fed into the decoder block if using
             `layer_type="decoder"`. Required when `attention_type="cross"`.
        is_first_microbatch : {True, False, None}, default = None
                             During training using either gradient accumulation or
                             pipeline parallelism a minibatch of data is further split
                             into microbatches. Between the microbatches of the same minibatch
                             the model weights are not updated. Setting this parameter indicates
                             whether the current microbatch is the first in a minibatch or not.
                             When set, this parameter enables additional optimizations:

                             * during FP8 training, it allows caching of the FP8 versions of
                               the weights
                             * it also allows skipping gradient accumulation during the
                               first microbatch (since it is the first gradient being
                               produced)
        checkpoint_core_attention: bool, default = `False`
                                  If true, forward activations for core attention are recomputed
                                  during the backward pass in order to save memory that would
                                  otherwise be occupied to store the forward activations until
                                  backprop.
        inference_params: Optional[Any], default = `None`
                         Inference state object. When given (and `layer_number` is set),
                         its `key_value_memory_dict`, `max_sequence_len`, `max_batch_size`,
                         `batch_size_offset` and `sequence_len_offset` attributes are used
                         to maintain a per-layer key/value cache.
        rotary_pos_emb: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], default = `None`
                       Embeddings for query and key tensors for applying rotary position
                       embedding. By default no input embedding is applied.
        core_attention_bias_type: str, default = `no_bias`
                    Bias type, {`no_bias`, `pre_scale_bias`, `post_scale_bias`}
        core_attention_bias: Optional[torch.Tensor], default = `None`
                    Bias tensor for Q * K.T
        fast_zero_fill: bool, default = `True`
                    Whether to set output tensors to 0 or not before use.

        Returns
        -------
        The attention output alone, or a tuple starting with it, followed by the
        output projection bias (when `return_bias` is set) and the layernorm output
        (when both `input_layernorm` and `return_layernorm_output` are set).
        """
        # hidden_states: [sq, b, h]

        if self.attn_mask_type is not None:
            warnings.warn(
                "Argument :attr:`attn_mask_type` has been moved to the `forward` method and "
                "is deprecated. It will be fully removed in future releases.",
                category=DeprecationWarning,
            )
            # Keep previous functionality for current users.
            attn_mask_type = self.attn_mask_type

        if attn_mask_type == "padding" and attention_mask is not None:
            assert (
                attention_mask.dtype == torch.bool
            ), "Attention mask must be a boolean tensor"

        assert (core_attention_bias_type in AttnBiasTypes
                ), f"core_attention_bias_type {core_attention_bias_type} is not supported!"

        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================

        is_first_step = False
        if inference_params and self.layer_number is not None:
            if self.layer_number not in inference_params.key_value_memory_dict:
                # First call for this layer: create its key and value caches.
                inf_max_seq_len = inference_params.max_sequence_len
                inf_max_batch_size = inference_params.max_batch_size
                inference_key_memory = self._allocate_memory(
                    inf_max_seq_len, inf_max_batch_size, hidden_states.dtype
                )
                inference_value_memory = self._allocate_memory(
                    inf_max_seq_len, inf_max_batch_size, hidden_states.dtype
                )
                inference_params.key_value_memory_dict[self.layer_number] = (
                    inference_key_memory,
                    inference_value_memory,
                )
                is_first_step = True
            else:
                (
                    inference_key_memory,
                    inference_value_memory,
                ) = inference_params.key_value_memory_dict[self.layer_number]

        # =====================
        # Query, Key, and Value
        # =====================

        if self.attention_type == "self" and self.num_gqa_groups == self.num_attention_heads:
            # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
            if self.input_layernorm:
                layernorm_qkv_outputs = self.layernorm_qkv(
                    hidden_states,
                    is_first_microbatch=is_first_microbatch,
                )
                if self.return_layernorm_output:
                    mixed_x_layer, layernorm_output = layernorm_qkv_outputs
                else:
                    mixed_x_layer = layernorm_qkv_outputs
            else:
                mixed_x_layer = self.qkv(
                    hidden_states,
                    is_first_microbatch=is_first_microbatch,
                )

            if self.qkv_weight_interleaved:
                # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
                new_tensor_shape = mixed_x_layer.size()[:-1] + (
                    self.num_attention_heads_per_partition,
                    3 * self.hidden_size_per_attention_head,
                )
                # split along last dimension
                split_dim = -1
            else:
                # [sq, b, (np * 3 * hn)] --> [sq, b, 3 * np, hn]
                new_tensor_shape = mixed_x_layer.size()[:-1] + (
                    3 * self.num_attention_heads_per_partition,
                    self.hidden_size_per_attention_head,
                )
                # split along second last dimension
                split_dim = -2

            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # mixed_x_layer --> 3 [sq, b, np, hn]
            if split_dim == -1 and not is_in_onnx_export_mode():
                query_layer, key_layer, value_layer = _SplitLastDim.apply(mixed_x_layer, 3)
            else:
                query_layer, key_layer, value_layer = split_tensor_along_dim(
                    mixed_x_layer, split_dim, 3
                )
        elif ((self.attention_type == "cross")
                or (self.attention_type == "self"
                    and self.num_gqa_groups != self.num_attention_heads)):

            if self.attention_type == "cross":
                # Fail with a clear message instead of deep inside the KV projection.
                assert (
                    encoder_output is not None
                ), "encoder_output must be provided for cross attention"
                input_tensor = encoder_output
            else:
                input_tensor = hidden_states

            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
            mixed_kv_layer = self.key_value(
                input_tensor,
                is_first_microbatch=is_first_microbatch,
            )

            if self.qkv_weight_interleaved:
                # [sq, b, (np * 2 * hn)] --> [sq, b, np, 2 * hn]
                new_tensor_shape = mixed_kv_layer.size()[:-1] + (
                    self.num_gqa_groups_per_partition,
                    2 * self.hidden_size_per_attention_head,
                )
                # split along last dimension
                split_dim = -1
            else:
                # [sq, b, (np * 2 * hn)] --> [sq, b, 2 * np, hn]
                new_tensor_shape = mixed_kv_layer.size()[:-1] + (
                    2 * self.num_gqa_groups_per_partition,
                    self.hidden_size_per_attention_head,
                )
                # split along second last dimension
                split_dim = -2

            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)

            # mixed_kv_layer --> 2 [sk, b, np, hn]
            if split_dim == -1 and not is_in_onnx_export_mode():
                key_layer, value_layer = _SplitLastDim.apply(mixed_kv_layer, 2)
            else:
                key_layer, value_layer = split_tensor_along_dim(mixed_kv_layer, split_dim, 2)

            # Attention head [sq, b, h] --> [sq, b, hp]
            if self.input_layernorm:
                layernorm_query_outputs = self.layernorm_query(
                    hidden_states,
                    is_first_microbatch=is_first_microbatch,
                )
                if self.return_layernorm_output:
                    query_layer, layernorm_output = layernorm_query_outputs
                else:
                    query_layer = layernorm_query_outputs
            else:
                query_layer = self.query_layer(
                    hidden_states,
                    is_first_microbatch=is_first_microbatch,
                )

            # [sq, b, hp] --> [sq, b, np, hn]
            new_tensor_shape = query_layer.size()[:-1] + (
                self.num_attention_heads_per_partition,
                self.hidden_size_per_attention_head,
            )
            query_layer = query_layer.view(*new_tensor_shape)

        # ==================================
        # Adjust key and value for inference
        # ==================================

        # duplicate the pos_emb for self attention
        if rotary_pos_emb is not None:
            if not isinstance(rotary_pos_emb, tuple):
                rotary_pos_emb = ((rotary_pos_emb,) * 2)

        if inference_params and self.layer_number is not None:
            batch_start = inference_params.batch_size_offset
            batch_end = batch_start + key_layer.size(1)
            assert batch_end <= inference_key_memory.size(1)
            sequence_start = inference_params.sequence_len_offset
            sequence_end = sequence_start + key_layer.size(0)
            assert sequence_end <= inference_key_memory.size(0)
            # Copy key and values.
            inference_key_memory[
                sequence_start:sequence_end, batch_start:batch_end, ...
            ] = key_layer
            inference_value_memory[
                sequence_start:sequence_end, batch_start:batch_end, ...
            ] = value_layer
            # Attend over everything cached so far, not only the new tokens.
            key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...]
            value_layer = inference_value_memory[
                :sequence_end, batch_start:batch_end, ...
            ]

            # adjust the key rotary positional embedding
            if rotary_pos_emb is not None:
                q_pos_emb, k_pos_emb = rotary_pos_emb
                if not is_first_step:
                    # In inference, we compute one token at a time.
                    # Select the correct positional embedding
                    # (only the last token in the sequence)
                    q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end]
                else:
                    # In the first forward pass of inference,
                    # we use the entire provided prefix.
                    # q_pos_emb here has the rope embeddings of the entire
                    # prefix + to-be-generated output so
                    # we slice to just the prefix.
                    q_pos_emb = q_pos_emb[:sequence_end, :, :, :]
                k_pos_emb = k_pos_emb[:sequence_end, :, :, :]
                rotary_pos_emb = (q_pos_emb, k_pos_emb)

        # ==================================
        # core attention computation
        # ==================================

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            q_pos_emb, k_pos_emb = rotary_pos_emb
            query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb)

        context_layer = self.core_attention(
            query_layer,
            key_layer,
            value_layer,
            attention_mask=attention_mask,
            attn_mask_type=attn_mask_type,
            checkpoint_core_attention=checkpoint_core_attention,
            core_attention_bias_type=core_attention_bias_type,
            core_attention_bias=core_attention_bias,
            fast_zero_fill=fast_zero_fill,
        )

        # =================
        # Output. [sq, b, h]
        # =================

        projection_output = self.proj(
            context_layer, is_first_microbatch=is_first_microbatch
        )

        if self.return_bias:
            # The projection returns (output, bias) when the bias is not applied.
            attention_output, attention_bias = projection_output
            outputs = (attention_output, attention_bias)
        else:
            outputs = (projection_output,)
        if self.input_layernorm and self.return_layernorm_output:
            outputs += (layernorm_output,)
        # A single result is returned bare rather than as a 1-tuple.
        return outputs if len(outputs) > 1 else outputs[0]