Commit ae856f3a authored by Woosuk Kwon

Remove unnecessary files

parent 6ac8e63a
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
class IndexFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
# return input[indices]
return torch.gather(
rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
).reshape(-1, *other_shape)
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
grad_output = rearrange(grad_output, "b ... -> b (...)")
grad_input = torch.zeros(
[ctx.first_axis_dim, grad_output.shape[1]],
device=grad_output.device,
dtype=grad_output.dtype,
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
# grad_input[indices] = grad_output
grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis = IndexFirstAxis.apply
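# Hypothetical sanity sketch (not part of the original file; the helper name below
# is made up): in the forward pass index_first_axis selects rows along the first
# axis exactly like plain advanced indexing, while its backward scatters gradients
# with scatter_ as noted in the comments above.
def _example_index_first_axis():
    x = torch.randn(8, 4, 16)
    idx = torch.tensor([0, 3, 5])
    assert torch.equal(index_first_axis(x, idx), x[idx])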
class IndexPutFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, values, indices, first_axis_dim):
ctx.save_for_backward(indices)
assert indices.ndim == 1
assert values.ndim >= 2
output = torch.zeros(
first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
output[indices] = values
# output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
return output
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
grad_values = grad_output[indices]
# grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
return grad_values, None, None
index_put_first_axis = IndexPutFirstAxis.apply
class IndexFirstAxisResidual(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
output = input[indices]
# We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
# memory format to channel_first. In other words, input might not be contiguous.
# If we don't detach, Pytorch complains that the output is a view and is being modified in place
return output, input.detach()
@staticmethod
def backward(ctx, grad_output, grad_residual):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
assert grad_residual.shape[1:] == other_shape
grad_input = grad_residual
# grad_input[indices] += grad_output
indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
indices = indices.expand_as(grad_output)
grad_input.scatter_add_(0, indices, grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis_residual = IndexFirstAxisResidual.apply
def unpad_input(hidden_states, attention_mask):
"""
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
"""
Supports concatenating short samples into one sequence. The attention_mask_in_length is used to mask out the other short samples, which enables efficient training on variable-length samples (e.g., the supervised fine-tuning task for large language models).
The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
```
[
[2, 3, 0, 0, 0, 0],
[3, 2, 0, 0, 0, 0],
[6, 0, 0, 0, 0, 0]
]
```
which corresponds to the 3D attention mask:
```
[
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
[0, 0, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 1]
]
]
```
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask_in_length: (batch, seqlen), int, where a nonzero entry (e.g., 1, 2, 3) gives the length of a concatenated sub-sequence in the b-th batch entry, and 0 means none.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask_in_length.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
length = attention_mask_in_length.sum(dim=-1)
seqlen = attention_mask_in_length.size(-1)
attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
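# Hypothetical worked example (not part of the original file; the helper name is
# made up), reusing the attention_mask_in_length from the docstring above: three
# batch entries packing sub-sequences of lengths (2, 3), (3, 2) and (6,) into
# seqlen = 6, i.e. 5 + 5 + 6 = 16 valid tokens in total.
def _example_unpad_concatenated_sequences():
    hidden_states = torch.randn(3, 6, 8)
    attention_mask_in_length = torch.tensor(
        [[2, 3, 0, 0, 0, 0], [3, 2, 0, 0, 0, 0], [6, 0, 0, 0, 0, 0]]
    )
    out, indices, cu_seqlens, max_seqlen = unpad_input_for_concatenated_sequences(
        hidden_states, attention_mask_in_length
    )
    # out: (16, 8); cu_seqlens: [0, 2, 5, 8, 10, 16]; max_seqlen: 6
    assert out.shape == (16, 8) and max_seqlen == 6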
def pad_input(hidden_states, indices, batch, seqlen):
"""
Arguments:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
batch: int, batch size for the padded sequence.
seqlen: int, maximum sequence length for the padded sequence.
Return:
hidden_states: (batch, seqlen, ...)
"""
dim = hidden_states.shape[-1]
# output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
# output[indices] = hidden_states
output = index_put_first_axis(hidden_states, indices, batch * seqlen)
return rearrange(output, "(b s) ... -> b s ...", b=batch)
"""
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR) but it doesn't yet seem to work for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.
We use the FlashAttention implementation from Phil Tillet as a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
small batch size * nheads.
Caution:
- This is an *experimental* implementation. The forward pass should be quite robust but
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
(due to the Triton compiler), as done in tests/test_flash_attn.py
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
that there are none left for other head dimensions.
Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
"""
import math
import torch
import triton
import triton.language as tl
# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128
# @triton.autotune(
# configs=[
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1),
# # This config has a race condition when EVEN_M == False, disabling it for now.
# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
# ],
# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM']
# )
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _fwd_kernel(
Q,
K,
V,
Bias,
Out,
Lse,
TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_ob,
stride_oh,
stride_om,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# off_b = tl.program_id(1)
# off_h = tl.program_id(2)
# off_hb = off_b * nheads + off_h
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# Initialize pointers to Q, K, V
# Adding parentheses around indexing might use int32 math instead of int64 math?
# https://github.com/openai/triton/issues/741
# I'm seeing a tiny bit of difference (5-7us)
q_ptrs = (
Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
)
k_ptrs = (
K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
)
v_ptrs = (
V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
)
if BIAS_TYPE == "vector":
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = (
Bias
+ off_b * stride_bb
+ off_h * stride_bh
+ (offs_m[:, None] * stride_bm + offs_n[None, :])
)
# initialize pointer to m and l
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
# load q: it will stay in SRAM throughout
# [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
# tl.load(q_ptrs), we get the wrong output!
if EVEN_M & EVEN_N:
if EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0
)
# loop over k, v and update accumulator
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
for start_n in range(0, end_n, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
k = tl.load(k_ptrs + start_n * stride_kn)
else:
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
if IS_CAUSAL:
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
if BIAS_TYPE != "none":
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0
).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n,
mask=(offs_m[:, None] < seqlen_q)
& ((start_n + offs_n)[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
# Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
# can then fuse the mult and add into an fma instruction. But if we have bias we need to
# multiply with softmax_scale here.
qk = qk * softmax_scale + bias
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
p = tl.exp(qk - m_ij[:, None])
else:
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
p = tl.exp(qk * softmax_scale - m_ij[:, None])
l_ij = tl.sum(p, 1)
# scale acc_o
acc_o_scale = tl.exp(m_i - m_ij)
# # -- update output accumulator --
# BUG: have to store and immediately load
tl.store(t_ptrs, acc_o_scale)
acc_o_scale = tl.load(t_ptrs)
acc_o = acc_o * acc_o_scale[:, None]
# update acc_o
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
v = tl.load(v_ptrs + start_n * stride_vn)
else:
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
p = p.to(v.dtype)
acc_o += tl.dot(p, v)
# -- update statistics
m_i = m_ij
l_i_new = tl.exp(lse_i - m_ij) + l_ij
lse_i = m_ij + tl.log(l_i_new)
o_scale = tl.exp(m_i - lse_i)
# BUG: have to store and immediately load
tl.store(t_ptrs, o_scale)
o_scale = tl.load(t_ptrs)
acc_o = acc_o * o_scale[:, None]
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
tl.store(lse_ptrs, lse_i)
# initialize pointers to output
offs_d = tl.arange(0, BLOCK_HEADDIM)
out_ptrs = (
Out
+ off_b * stride_ob
+ off_h * stride_oh
+ (offs_m[:, None] * stride_om + offs_d[None, :])
)
if EVEN_M:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o)
else:
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
else:
tl.store(
out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
)
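# Minimal PyTorch sketch (hypothetical, not part of the original file) of the
# single-pass online-softmax recurrence the kernel above implements: each K/V
# block rescales the running accumulator and the running log-sum-exp, which is
# why only the LSE (and not separate m and l) needs to be written out.
def _online_softmax_reference(q, k, v, softmax_scale, block_n=128):
    # q: (seqlen_q, d); k, v: (seqlen_k, d). Float32 reference, no masking/bias.
    acc = torch.zeros(q.shape[0], v.shape[1], dtype=torch.float32)
    m_i = torch.full((q.shape[0],), float("-inf"))
    lse = torch.full((q.shape[0],), float("-inf"))
    for start in range(0, k.shape[0], block_n):
        kb, vb = k[start : start + block_n], v[start : start + block_n]
        qk = (q.float() @ kb.float().t()) * softmax_scale
        m_ij = torch.maximum(qk.max(dim=1).values, lse)
        p = torch.exp(qk - m_ij[:, None])
        acc = acc * torch.exp(m_i - m_ij)[:, None] + p @ vb.float()
        lse = m_ij + torch.log(torch.exp(lse - m_ij) + p.sum(dim=1))
        m_i = m_ij
    out = acc * torch.exp(m_i - lse)[:, None]  # equals softmax(q k^T * scale) @ v
    return out, lse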
@triton.jit
def _bwd_preprocess_do_o_dot(
Out,
DO,
Delta,
stride_ob,
stride_oh,
stride_om,
stride_dob,
stride_doh,
stride_dom,
nheads,
seqlen_q,
seqlen_q_rounded,
headdim,
BLOCK_M: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# load
o = tl.load(
Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
do = tl.load(
DO
+ off_b * stride_dob
+ off_h * stride_doh
+ offs_m[:, None] * stride_dom
+ offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
@triton.jit
def _bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
):
# [2022-11-01] TD: Same bug. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.store(dv_ptrs), there's a race condition
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
else:
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
else:
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
@triton.jit
def _bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD: tl.constexpr,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
# We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
# initialize row/col offsets
offs_qm = begin_m + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
offs_m = tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
if BIAS_TYPE == "vector":
b_ptrs = Bias + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
# initialize dv and dk
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
# There seems to be some problem with Triton pipelining that makes results wrong for
# headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop
# may have zero step, and pipelining with the bias matrix could screw it up.
# So we just exit early.
if begin_m >= seqlen_q:
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
return
# k and v stay in SRAM throughout
# [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.load(k_ptrs), we get the wrong output!
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
else:
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
else:
k = tl.load(
k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
v = tl.load(
v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
# loop over rows
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
start_m = tl.multiple_of(start_m, BLOCK_M)
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
# Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
if EVEN_M & EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# recompute p = softmax(qk, dim=-1).T
qk = tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
if IS_CAUSAL:
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
if BIAS_TYPE != "none":
tl.debug_barrier() # Race condition otherwise
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(
b_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
qk = qk * softmax_scale + bias
# There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong.
# Also wrong for headdim=64.
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
lse_i = tl.load(LSE + offs_m_curr)
if BIAS_TYPE == "none":
p = tl.exp(qk * softmax_scale - lse_i[:, None])
else:
p = tl.exp(qk - lse_i[:, None])
# compute dv
# [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs
# in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512,
# the output is correct.
if EVEN_M & EVEN_HEADDIM:
do = tl.load(do_ptrs)
else:
# [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask.
do = tl.load(
do_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# if EVEN_M:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs)
# else:
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
# else:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
# else:
# do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q)
# & (offs_d[None, :] < headdim), other=0.0)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
# There seems to be a race condition when headdim=48/96, and dq, dk are wrong.
# Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True
# Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
dp = tl.dot(do, v, trans_b=True)
# There's a race condition for headdim=48
if not EVEN_HEADDIM:
tl.debug_barrier()
# compute ds = p * (dp - delta[:, None])
# Putting the subtraction after the dp matmul (instead of before) is slightly faster
Di = tl.load(D + offs_m_curr)
# Converting ds to q.dtype here reduces register pressure and makes it much faster
# for BLOCK_HEADDIM=128
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
# compute dk = dot(ds.T, q)
dk += tl.dot(ds, q, trans_a=True)
# compute dq
if not (
EVEN_M & EVEN_HEADDIM
): # Otherwise there's a race condition when BIAS_TYPE='matrix'
tl.debug_barrier()
if not ATOMIC_ADD:
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds, k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
else:
if EVEN_HEADDIM:
dq = tl.load(
dq_ptrs,
mask=offs_m_curr[:, None] < seqlen_q,
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=offs_m_curr[:, None] < seqlen_q,
eviction_policy="evict_last",
)
else:
dq = tl.load(
dq_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
eviction_policy="evict_last",
)
else: # If we're parallelizing across the seqlen_k dimension
dq = tl.dot(ds, k)
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
tl.atomic_add(dq_ptrs, dq)
else:
if EVEN_HEADDIM:
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
else:
tl.atomic_add(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
)
# increment pointers
dq_ptrs += BLOCK_M * stride_dqm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_dom
if BIAS_TYPE == "matrix":
b_ptrs += BLOCK_M * stride_bm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
def init_to_zero(name):
return lambda nargs: nargs[name].zero_()
@triton.autotune(
configs=[
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
# Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now
# # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4*
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
],
key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"],
)
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_dob,
stride_doh,
stride_dom,
stride_dqb,
stride_dqh,
stride_dqm,
stride_dkb,
stride_dkh,
stride_dkn,
stride_dvb,
stride_dvh,
stride_dvn,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
SEQUENCE_PARALLEL: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# offset pointers for batch/head
Q += off_b * stride_qb + off_h * stride_qh
K += off_b * stride_kb + off_h * stride_kh
V += off_b * stride_vb + off_h * stride_vh
DO += off_b * stride_dob + off_h * stride_doh
DQ += off_b * stride_dqb + off_h * stride_dqh
DK += off_b * stride_dkb + off_h * stride_dkh
DV += off_b * stride_dvb + off_h * stride_dvh
if BIAS_TYPE != "none":
Bias += off_b * stride_bb + off_h * stride_bh
# pointer to row-wise quantities in value-like data
D += off_hb * seqlen_q_rounded
LSE += off_hb * seqlen_q_rounded
if not SEQUENCE_PARALLEL:
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
for start_n in range(0, num_block_n):
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=False,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
else:
start_n = tl.program_id(0)
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=True,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
# shape constraints
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
assert k.shape == (batch, seqlen_k, nheads, d)
assert v.shape == (batch, seqlen_k, nheads, d)
assert d <= 128, "FlashAttention only support head dimensions up to 128"
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
assert q.is_cuda and k.is_cuda and v.is_cuda
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
if bias.stride(-1) != 1:
bias = bias.contiguous()
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
o = torch.empty_like(q)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
BLOCK = 128
num_warps = 4 if d <= 64 else 8
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_fwd_kernel[grid](
q,
k,
v,
bias,
o,
lse,
tmp,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
o.stride(0),
o.stride(2),
o.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
num_warps=num_warps,
num_stages=1,
)
return o, lse, softmax_scale # softmax_scale could have been updated
def _flash_attn_backward(
do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None
):
# Make sure that the last dimension is contiguous
if do.stride(-1) != 1:
do = do.contiguous()
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
# assert d in {16, 32, 64, 128}
assert d <= 128
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
assert lse.shape == (batch, nheads, seqlen_q_rounded)
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
# dq_accum = torch.zeros_like(q, dtype=torch.float32)
dq_accum = torch.empty_like(q, dtype=torch.float32)
delta = torch.empty_like(lse)
# delta = torch.zeros_like(lse)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_bwd_preprocess_do_o_dot[grid](
o,
do,
delta,
o.stride(0),
o.stride(2),
o.stride(1),
do.stride(0),
do.stride(2),
do.stride(1),
nheads,
seqlen_q,
seqlen_q_rounded,
d,
BLOCK_M=128,
BLOCK_HEADDIM=BLOCK_HEADDIM,
)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
assert bias.stride(-1) == 1
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
# BLOCK_M = 128
# BLOCK_N = 64
# num_warps = 4
grid = lambda META: (
triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
batch * nheads,
)
_bwd_kernel[grid](
q,
k,
v,
bias,
do,
dq_accum,
dk,
dv,
lse,
delta,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
do.stride(0),
do.stride(2),
do.stride(1),
dq_accum.stride(0),
dq_accum.stride(2),
dq_accum.stride(1),
dk.stride(0),
dk.stride(2),
dk.stride(1),
dv.stride(0),
dv.stride(2),
dv.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
# SEQUENCE_PARALLEL=False,
# BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
# num_warps=num_warps,
# num_stages=1,
)
dq.copy_(dq_accum)
class FlashAttnQKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
"""
qkv: (batch, seqlen, 3, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
"""
# Make sure that the last dimension is contiguous
if qkv.stride(-1) != 1:
qkv = qkv.contiguous()
o, lse, ctx.softmax_scale = _flash_attn_forward(
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
bias=bias,
causal=causal,
softmax_scale=softmax_scale,
)
ctx.save_for_backward(qkv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
qkv, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dqkv = torch.empty_like(qkv)
_flash_attn_backward(
do,
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
o,
lse,
dqkv[:, :, 0],
dqkv[:, :, 1],
dqkv[:, :, 2],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dqkv, None, None, None
flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
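# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up). The QKV-packed variant takes a single
# (batch, seqlen, 3, nheads, headdim) tensor, the layout produced by a fused QKV
# projection; since the op is exposed through autograd.Function.apply, bias and
# causal are passed positionally.
def _example_flash_attn_qkvpacked_func():
    qkv = torch.randn(
        2, 1024, 3, 16, 64, device="cuda", dtype=torch.float16, requires_grad=True
    )
    out = flash_attn_qkvpacked_func(qkv, None, True)  # bias=None, causal=True
    out.sum().backward()  # the gradient lands in qkv.grad with the same packed layout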
class FlashAttnKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
"""
q: (batch, seqlen_q, nheads, headdim)
kv: (batch, seqlen_k, 2, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, kv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, kv, o, lse, bias = ctx.saved_tensors
if len(ctx.needs_input_grad) >= 3:
assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dkv = torch.empty_like(kv)
_flash_attn_backward(
do,
q,
kv[:, :, 0],
kv[:, :, 1],
o,
lse,
dq,
dkv[:, :, 0],
dkv[:, :, 1],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dkv, None, None, None
flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
class FlashAttnFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
"""
q: (batch_size, seqlen_q, nheads, headdim)
k, v: (batch_size, seqlen_k, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, k, v, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
_flash_attn_backward(
do,
q,
k,
v,
o,
lse,
dq,
dk,
dv,
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dk, dv, None, None, None
flash_attn_func = FlashAttnFunc.apply
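# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up) for the unpacked interface, following the docstring above.
def _example_flash_attn_func():
    q, k, v = [
        torch.randn(2, 1024, 16, 64, device="cuda", dtype=torch.float16, requires_grad=True)
        for _ in range(3)
    ]
    out = flash_attn_func(q, k, v, None, True)  # bias=None, causal=True
    out.sum().backward()  # dq, dk, dv come from the Triton backward kernel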
# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
# for benchmarking.
# We fixed a few dtype casts to make it work for bf16
"""
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
"""
import pytest
import torch
import triton
import triton.language as tl
@triton.jit
def _fwd_kernel(
Q,
K,
V,
sm_scale,
TMP,
L,
M, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
Out,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
stride_oz,
stride_oh,
stride_om,
stride_on,
Z,
H,
N_CTX,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hz = tl.program_id(1)
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_DMODEL)
off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk
# Initialize pointers to Q, K, V
q_ptrs = Q + off_q
k_ptrs = K + off_k
v_ptrs = V + off_v
# initialize pointer to m and l
t_ptrs = TMP + off_hz * N_CTX + offs_m
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# load q: it will stay in SRAM throughout
q = tl.load(q_ptrs)
# loop over k, v and update accumulator
for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
k = tl.load(k_ptrs + start_n * stride_kn)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
qk *= sm_scale
qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
# -- compute m_ij, p, l_ij
m_ij = tl.max(qk, 1)
p = tl.exp(qk - m_ij[:, None])
l_ij = tl.sum(p, 1)
# -- update m_i and l_i
m_i_new = tl.maximum(m_i, m_ij)
alpha = tl.exp(m_i - m_i_new)
beta = tl.exp(m_ij - m_i_new)
l_i_new = alpha * l_i + beta * l_ij
# -- update output accumulator --
# scale p
p_scale = beta / l_i_new
p = p * p_scale[:, None]
# scale acc
acc_scale = l_i / l_i_new * alpha
tl.store(t_ptrs, acc_scale)
acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load
acc = acc * acc_scale[:, None]
# update acc
v = tl.load(v_ptrs + start_n * stride_vk)
p = p.to(v.dtype)
acc += tl.dot(p, v)
# update m_i and l_i
l_i = l_i_new
m_i = m_i_new
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
l_ptrs = L + off_hz * N_CTX + offs_m
m_ptrs = M + off_hz * N_CTX + offs_m
tl.store(l_ptrs, l_i)
tl.store(m_ptrs, m_i)
# initialize pointers to output
offs_n = tl.arange(0, BLOCK_DMODEL)
off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
out_ptrs = Out + off_o
tl.store(out_ptrs, acc)
@triton.jit
def _bwd_preprocess(
Out,
DO,
L,
NewDO,
Delta,
BLOCK_M: tl.constexpr,
D_HEAD: tl.constexpr,
):
off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
off_n = tl.arange(0, D_HEAD)
# load
o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
denom = tl.load(L + off_m).to(tl.float32)
# compute
do = do / denom[:, None]
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
tl.store(Delta + off_m, delta)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
sm_scale,
Out,
DO,
DQ,
DK,
DV,
L,
M,
D,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
Z,
H,
N_CTX,
num_block,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hz = tl.program_id(0)
off_z = off_hz // H
off_h = off_hz % H
# offset pointers for batch/head
Q += off_z * stride_qz + off_h * stride_qh
K += off_z * stride_qz + off_h * stride_qh
V += off_z * stride_qz + off_h * stride_qh
DO += off_z * stride_qz + off_h * stride_qh
DQ += off_z * stride_qz + off_h * stride_qh
DK += off_z * stride_qz + off_h * stride_qh
DV += off_z * stride_qz + off_h * stride_qh
for start_n in range(0, num_block):
lo = start_n * BLOCK_M
# initialize row/col offsets
offs_qm = lo + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
offs_m = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, BLOCK_DMODEL)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
# pointer to row-wise quantities in value-like data
D_ptrs = D + off_hz * N_CTX
m_ptrs = M + off_hz * N_CTX
# initialize dv and dk
dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# k and v stay in SRAM throughout
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
# loop over rows
for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
q = tl.load(q_ptrs)
# recompute p = softmax(qk, dim=-1).T
# NOTE: `do` is pre-divided by `l`; no normalization here
qk = tl.dot(q, k, trans_b=True)
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
m = tl.load(m_ptrs + offs_m_curr)
p = tl.exp(qk * sm_scale - m[:, None])
# compute dv
do = tl.load(do_ptrs)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
Di = tl.load(D_ptrs + offs_m_curr)
dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
dp += tl.dot(do, v, trans_b=True)
# compute ds = p * (dp - delta[:, None])
ds = p * dp * sm_scale
# compute dk = dot(ds.T, q)
dk += tl.dot(ds.to(q.dtype), q, trans_a=True)
# # compute dq
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds.to(k.dtype), k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
# # increment pointers
dq_ptrs += BLOCK_M * stride_qm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_qm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
class _attention(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, sm_scale):
BLOCK = 128
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
assert Lk in {16, 32, 64, 128}
o = torch.empty_like(q)
grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
tmp = torch.empty(
(q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32
)
L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
num_warps = 4 if Lk <= 64 else 8
_fwd_kernel[grid](
q,
k,
v,
sm_scale,
tmp,
L,
m,
o,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
o.stride(0),
o.stride(1),
o.stride(2),
o.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
BLOCK_DMODEL=Lk,
num_warps=num_warps,
num_stages=1,
)
ctx.save_for_backward(q, k, v, o, L, m)
ctx.BLOCK = BLOCK
ctx.grid = grid
ctx.sm_scale = sm_scale
ctx.BLOCK_DMODEL = Lk
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, l, m = ctx.saved_tensors
do = do.contiguous()
dq = torch.zeros_like(q, dtype=torch.float32)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
do_scaled = torch.empty_like(do)
delta = torch.empty_like(l)
_bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)](
o,
do,
l,
do_scaled,
delta,
BLOCK_M=ctx.BLOCK,
D_HEAD=ctx.BLOCK_DMODEL,
)
# NOTE: kernel currently buggy for other values of `num_warps`
num_warps = 8
_bwd_kernel[(ctx.grid[1],)](
q,
k,
v,
ctx.sm_scale,
o,
do_scaled,
dq,
dk,
dv,
l,
m,
delta,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
ctx.grid[0],
BLOCK_M=ctx.BLOCK,
BLOCK_N=ctx.BLOCK,
BLOCK_DMODEL=ctx.BLOCK_DMODEL,
num_warps=num_warps,
num_stages=1,
)
return dq.to(q.dtype), dk, dv, None
attention = _attention.apply
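# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up) for the benchmark kernel above: the layout is
# (batch, nheads, seqlen, headdim), the kernel is causal-only, and seqlen should
# be a multiple of BLOCK = 128 since no bounds masking is applied.
def _example_tutorial_attention():
    q = torch.randn(2, 16, 1024, 64, device="cuda", dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)
    out = attention(q, k, v, 0.125)  # sm_scale = 1 / sqrt(headdim)
    out.sum().backward()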
import math
import hydra
import torch
import torch.nn as nn
from einops import rearrange
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from flash_attn.flash_blocksparse_attn_interface import (
convert_blockmask,
flash_blocksparse_attn_func,
)
class FlashBlocksparseAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_temp: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.1)
"""
def __init__(
self,
sparsity_config,
softmax_temp=None,
attention_dropout=0.0,
max_seq_length=2048,
device=None,
dtype=None,
):
super().__init__()
self.sparsity_config = hydra.utils.instantiate(sparsity_config)
self.softmax_temp = softmax_temp
self.dropout_p = attention_dropout
# initialize sparse layout and register as buffer
max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256
layout = self.sparsity_config.make_layout(max_seq_length)
self.register_buffer("layout", layout)
blockmask_converted = convert_blockmask(self.layout, causal=False)
self.register_buffer("blockmask_converted", blockmask_converted)
# logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}')
def forward(
self,
qkv,
attn_mask=None,
key_padding_mask=None,
causal=False,
cu_seqlens=None,
max_s=None,
need_weights=False,
convert_mask=True,
):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
attn_mask: An implementation of BaseMask that encodes where each
query can attend to
key_padding_mask: An implementation of BaseMask that encodes how
many queries each sequence in the batch consists of
"""
assert not need_weights
assert attn_mask is None
assert qkv.dtype == torch.float16
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if key_padding_mask is None:
qkv = rearrange(qkv, "b s ... -> (b s) ...")
max_s = seqlen
cu_seqlens = torch.arange(
0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device
)
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
else:
key_padding_mask_bool = key_padding_mask.bool_matrix
nheads = qkv.shape[-2]
x = rearrange(qkv, "b s three h d -> b s (three h d)")
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool)
x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
output_unpad = flash_blocksparse_attn_func(
x_unpad,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(
pad_input(
rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
),
"b s (h d) -> b s h d",
h=nheads,
)
else:
assert max_s is not None
seqlen = max_s
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if convert_mask:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
else:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
self.blockmask_converted,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
convert_mask=False,
)
return output, None
class FlashBlocksparseMHA(nn.Module):
def __init__(
self,
embed_dim,
num_heads,
sparsity_config,
bias=True,
batch_first=True,
attention_dropout=0.0,
causal=False,
max_seq_length=2048,
device=None,
dtype=None,
**kwargs,
) -> None:
assert batch_first
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.embed_dim = embed_dim
self.causal = causal
self.num_heads = num_heads
assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
self.head_dim = self.embed_dim // num_heads
assert self.head_dim in [16, 32, 64], "Only supports head_dim == 16, 32, or 64"
self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
self.inner_attn = FlashBlocksparseAttention(
sparsity_config,
attention_dropout=attention_dropout,
max_seq_length=max_seq_length,
**factory_kwargs,
)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
def forward(
self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False
):
qkv = self.Wqkv(x)
qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads)
context, attn_weights = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal
)
return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py
import flash_attn_cuda
import torch
import torch.nn as nn
def convert_blockmask(blockmask, causal):
"""Convert from the 0-1 format to the format used by the CUDA code.
0 means the block is skipped.
nonzero means the block is not skipped.
Argument:
blockmask: (row, col): a 0-1 tensor
Return:
blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
indices of the nonzero blocks, padded with -1 to reach length @row.
The indices are multiplied by 4, with the smallest bit used to encode whether
it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is
the last nonzero in its row.
"""
assert not causal
# TD [2022-05-13]: The indexing and sorting is very tricky
nrow, ncol = blockmask.shape
# Sort does not support bool on CUDA
blockmask = blockmask.to(dtype=torch.uint8)
nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
]
first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
]
nonzero_idx = nonzero_sorted_rowidx * 4
nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
nonzero_idx[nonzero_val == 0] = -1
return nonzero_idx.T.contiguous().to(dtype=torch.int32)
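# Hypothetical illustration (not part of the original file; the helper name is
# made up) of the formats described in the docstring above: a (nrow, ncol) 0/1
# block layout goes in, and a (ncol, nrow) int32 tensor of encoded row indices,
# padded with -1, comes out.
def _example_convert_blockmask():
    blockmask = torch.tensor([[1, 0, 1], [0, 1, 1]])
    converted = convert_blockmask(blockmask, causal=False)
    assert converted.shape == (3, 2) and converted.dtype == torch.int32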
def _flash_blocksparse_attn_forward(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
):
context, softmax_lse, *rest = flash_attn_cuda.fwd_block(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
)
# if context.isnan().any() or softmax_lse.isnan().any():
# breakpoint()
S_dmask = rest[0] if return_softmax else None
return context, softmax_lse, S_dmask
def _flash_blocksparse_attn_backward(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal,
):
dqkv, dp, softmax_d = flash_attn_cuda.bwd_block(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
softmax_scale,
max_s,
causal,
None,
)
# if dqkv.isnan().any() or softmax_d.isnan().any():
# breakpoint()
return dqkv
class FlashBlocksparseAttnFun(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
# Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=False,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context
@staticmethod
def backward(ctx, dout):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
# S_dmask is None, temporarily use another tensor just to get it running
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
context,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None, None
# We duplicate code to return both the output and the softmax for testing
# Returning both makes backward a bit slower, so we want to keep using the other version for speed.
class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=True,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context, S_dmask, softmax_lse
@staticmethod
def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None
def flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale=None,
causal=False,
return_attn_probs=False,
convert_mask=True,
):
"""dropout_p should be set to 0.0 during evaluation"""
func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS
if convert_mask:
blockmask = convert_blockmask(blockmask, causal=causal)
return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py
# for benchmarking.
# We added support for seqlen=2k and seqlen=4k
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex._autocast_utils import _cast_if_autocast_enabled
from apex.transformer.enums import AttnMaskType
from fused_softmax_lib import (
scaled_masked_softmax_backward,
scaled_masked_softmax_forward,
scaled_masked_softmax_get_batch_per_block,
scaled_upper_triang_masked_softmax_backward,
scaled_upper_triang_masked_softmax_forward,
)
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in GPT models).
    3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
def scaled_upper_triang_masked_softmax(inputs, _, scale):
b, np, sq, sk = inputs.size()
assert sq == sk, "causal mask is only for self attention"
# Reshaping input to 3D tensor (attn_batches, sq, sk)
inputs = inputs.view(-1, sq, sk)
args = _cast_if_autocast_enabled(inputs, scale)
with torch.cuda.amp.autocast(enabled=False):
probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
return probs.view(b, np, sq, sk)
# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
# So I needed to manually write two `torch.autograd.Function` inheritances.
# Fused operation which performs the following three operations in sequence:
# 1. Scale the tensor.
# 2. Apply the mask.
# 3. Perform softmax.
class ScaledMaskedSoftmax(torch.autograd.Function):
@staticmethod
def forward(ctx, inputs, mask, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
def scaled_masked_softmax(inputs, mask, scale):
# input is 4D tensor (b, np, sq, sk)
args = _cast_if_autocast_enabled(inputs, mask, scale)
with torch.cuda.amp.autocast(enabled=False):
return ScaledMaskedSoftmax.apply(*args)
class FusedScaleMaskSoftmax(torch.nn.Module):
"""
fused operation: scaling + mask + softmax
Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate if the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super().__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
if self.input_in_fp16 and self.input_in_bf16:
raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
if not (self.scale is None or softmax_in_fp32):
raise RuntimeError("softmax should be in fp32 when scaled")
if self.scaled_masked_softmax_fusion:
if self.attn_mask_type == AttnMaskType.causal:
self.fused_softmax_func = scaled_upper_triang_masked_softmax
elif self.attn_mask_type == AttnMaskType.padding:
self.fused_softmax_func = scaled_masked_softmax
else:
raise ValueError("Invalid attn_mask_type.")
def forward(self, input, mask):
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
and (
self.attn_mask_type == AttnMaskType.causal
or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
)
            and 16 < sk <= 8192  # sk must be in (16, 8192]
            and sq % 4 == 0  # sq must be divisible by 4
            and sk % 4 == 0  # sk must be divisible by 4
            and attn_batches % 4 == 0  # np * b must be divisible by 4
):
if 0 <= sk <= 8192:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
# input.shape = [b, np, sq, sk]
scale = self.scale if self.scale is not None else 1.0
return self.fused_softmax_func(input, mask, scale)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
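# Hypothetical usage sketch (not part of the original file): building the fused softmax module
# for padding-mask attention. The lambda mask_func mirrors Megatron-style masking, where masked
# positions are pushed to a large negative value; the argument values here are illustrative
# assumptions, not requirements of the module.
def _example_fused_scale_mask_softmax(attention_scores, attention_mask):
    softmax_module = FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        attn_mask_type=AttnMaskType.padding,
        scaled_masked_softmax_fusion=True,
        mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
        softmax_in_fp32=True,
        scale=None,
    )
    # attention_scores: (b, np, sq, sk) fp16 tensor; attention_mask: broadcastable bool tensor
    return softmax_module(attention_scores, attention_mask)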
# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py
# But we use nn.Linear instead of Conv2d and it's about 8x faster.
from functools import partial
import torch.nn as nn
from einops import rearrange
from torch import _assert
from torch.nn.modules.utils import _pair
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
class PatchEmbed(nn.Module):
"""2D Image to Patch Embedding"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
bias=True,
fused_bias_fc=False,
):
super().__init__()
img_size = _pair(img_size)
patch_size = _pair(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense
self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
_, _, H, W = x.shape
_assert(
H == self.img_size[0],
f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
)
_assert(
W == self.img_size[1],
f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
)
x = self.proj(
rearrange(
x,
"b c (h p1) (w p2) -> b h w (c p1 p2)",
p1=self.patch_size[0],
p2=self.patch_size[1],
)
)
if self.flatten:
x = rearrange(x, "b h w c -> b (h w) c")
x = self.norm(x)
return x
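# Illustrative sketch (not part of the original file): embedding a batch of 224x224 RGB images
# into 14 x 14 = 196 patch tokens of width 768 with the linear patch projection above.
def _example_patch_embed():
    import torch  # this file only imports torch.nn, so pull in torch for the demo tensor
    patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    images = torch.randn(2, 3, 224, 224)
    tokens = patch_embed(images)  # (2, 196, 768) since flatten=True by default
    return tokens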
# Copyright (c) 2023, Tri Dao.
import math
from typing import Optional, Tuple, Union
import torch
from einops import rearrange, repeat
from flash_attn.ops.triton.rotary import apply_rotary
def rotate_half(x, interleaved=False):
if not interleaved:
x1, x2 = x.chunk(2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
else:
x1, x2 = x[..., ::2], x[..., 1::2]
return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
"""
x: (batch_size, seqlen, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
"""
ro_dim = cos.shape[-1] * 2
assert ro_dim <= x.shape[-1]
cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
return torch.cat(
[x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]],
dim=-1,
)
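# Illustrative sketch (not part of the original file): exercising the pure-PyTorch reference
# above on CPU, rotating only the first 32 dimensions of a 64-dim head (rotary_dim = 32 is an
# arbitrary choice for the example).
def _example_apply_rotary_emb_torch():
    batch, seqlen, nheads, headdim, rotary_dim = 2, 16, 4, 64, 32
    x = torch.randn(batch, seqlen, nheads, headdim)
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
    freqs = torch.outer(torch.arange(seqlen, dtype=torch.float32), inv_freq)  # (seqlen, rotary_dim / 2)
    out = apply_rotary_emb_torch(x, torch.cos(freqs), torch.sin(freqs))
    return out  # same shape as x; the last headdim - rotary_dim dims are passed through unchanged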
class ApplyRotaryEmb(torch.autograd.Function):
@staticmethod
def forward(
ctx,
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
out = apply_rotary(
x,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
interleaved=interleaved,
inplace=inplace,
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin, cu_seqlens) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
ctx.inplace = inplace
ctx.max_seqlen = max_seqlen
return out if not inplace else x
@staticmethod
def backward(ctx, do):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cu_seqlens = ctx.saved_tensors
# TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
# "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
if not ctx.interleaved and not ctx.inplace:
do = do.clone()
dx = apply_rotary(
do,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=ctx.max_seqlen,
interleaved=ctx.interleaved,
inplace=ctx.inplace,
conjugate=True,
)
return dx, None, None, None, None, None, None, None
def apply_rotary_emb(
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
"""
Arguments:
x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
cos, sin: (seqlen_rotary, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
inplace: if True, apply rotary embedding in-place.
seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
cu_seqlens: (batch + 1,) or None
max_seqlen: int
Return:
out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
"""
return ApplyRotaryEmb.apply(
x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
)
# For backward compatibility
apply_rotary_emb_func = apply_rotary_emb
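# Hypothetical usage sketch (not part of the original file): applying the Triton rotary kernel
# to a (batch, seqlen, nheads, headdim) tensor on GPU. The shapes and the choice of rotating 32
# of the 64 head dimensions are illustrative; cos/sin are built as in apply_rotary_emb_torch above.
def _example_apply_rotary_emb():
    batch, seqlen, nheads, headdim, rotary_dim = 2, 128, 8, 64, 32
    x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, rotary_dim, 2, device="cuda", dtype=torch.float32) / rotary_dim)
    )
    freqs = torch.outer(torch.arange(seqlen, device="cuda", dtype=torch.float32), inv_freq)
    cos, sin = torch.cos(freqs).to(x.dtype), torch.sin(freqs).to(x.dtype)
    return apply_rotary_emb(x, cos, sin, interleaved=False, inplace=False)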
class ApplyRotaryEmbQKV_(torch.autograd.Function):
@staticmethod
def forward(
ctx,
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
batch, seqlen, three, nheads, headdim = qkv.shape
assert three == 3
if cos_k is None and sin_k is None and qkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need qkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
# qk = rearrange(qkv[:, :, :2], "b s t h d -> b s (t h) d")
qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
apply_rotary(
qk, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
q, k = qkv[:, :, 0], qkv[:, :, 1]
apply_rotary(q, cos, sin, seqlen_offsets, interleaved=interleaved, inplace=True)
apply_rotary(k, cos_k, sin_k, seqlen_offsets, interleaved=interleaved, inplace=True)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin, cos_k, sin_k)
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cos_k, sin_k, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return qkv
@staticmethod
def backward(ctx, dqkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cos_k, sin_k, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cos_k, sin_k = ctx.saved_tensors
if cos_k is None and sin_k is None and dqkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
dqk = rearrange(dqkv[:, :, :2], "b s t h d -> b s (t h) d")
apply_rotary(
dqk,
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
dq, dk = dqkv[:, :, 0], dqkv[:, :, 1]
apply_rotary(
dq, cos, sin, seqlen_offsets, interleaved=ctx.interleaved, inplace=True, conjugate=True
)
apply_rotary(
dk,
cos_k,
sin_k,
seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dqkv, None, None, None, None, None, None
def apply_rotary_emb_qkv_(
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
qkv: (batch_size, seqlen, 3, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
cos_k, sin_k: (seqlen, rotary_dim / 2), optional
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
qkv: (batch_size, seqlen, 3, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
"""
return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
class ApplyRotaryEmbKV_(torch.autograd.Function):
@staticmethod
def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0):
batch, seqlen, two, nheads, headdim = kv.shape
assert two == 2
k = kv[:, :, 0]
apply_rotary(
k, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return kv
@staticmethod
def backward(ctx, dkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, seqlen_offsets = ctx.saved_tensors
else:
cos, sin = ctx.saved_tensors
apply_rotary(
dkv[:, :, 0],
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dkv, None, None, None, None
def apply_rotary_emb_kv_(
kv,
cos,
sin,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
kv: (batch_size, seqlen, 2, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
kv: (batch_size, seqlen, 2, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of K.
"""
return ApplyRotaryEmbKV_.apply(kv, cos, sin, interleaved, seqlen_offsets)
class RotaryEmbedding(torch.nn.Module):
"""
    The rotary position embeddings from RoFormer_ (Su et al.).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.
    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_; GPT-NeoX was an inspiration.
.. _RoFormer: https://arxiv.org/abs/2104.09864
.. _repo: https://github.com/ZhuiyiTechnology/roformer
.. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
"""
def __init__(
self,
dim: int,
base=10000.0,
interleaved=False,
scale_base=None,
pos_idx_in_fp32=True,
device=None,
):
"""
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
otherwise they might be in lower precision.
        This option was added because previously (before 2023-07-02) we constructed
        the position indices using the dtype of self.inv_freq. In most cases this would
        be fp32, but if the model is trained in pure bf16 (not mixed precision), then
        self.inv_freq would be bf16, and the position indices would also be in bf16.
Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
embeddings for some positions will coincide.
To maintain compatibility with models previously trained in pure bf16,
we add this option.
"""
super().__init__()
self.dim = dim
self.base = float(base)
self.pos_idx_in_fp32 = pos_idx_in_fp32
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = self._compute_inv_freq(device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.interleaved = interleaved
self.scale_base = scale_base
scale = (
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
if scale_base is not None
else None
)
self.register_buffer("scale", scale, persistent=False)
self._seq_len_cached = 0
self._cos_cached = None
self._sin_cached = None
self._cos_k_cached = None
self._sin_k_cached = None
def _compute_inv_freq(self, device=None):
return 1.0 / (
self.base
** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
)
def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
# Reset the tables if the sequence length has changed,
# if we're on a new device (possibly due to tracing for instance),
# or if we're switching from inference mode to training
if (
seqlen > self._seq_len_cached
or self._cos_cached is None
or self._cos_cached.device != device
or self._cos_cached.dtype != dtype
or (self.training and self._cos_cached.is_inference())
):
self._seq_len_cached = seqlen
# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
# And the output of arange can be quite large, so bf16 would lose a lot of precision.
# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
if self.pos_idx_in_fp32:
t = torch.arange(seqlen, device=device, dtype=torch.float32)
# We want fp32 here as well since inv_freq will be multiplied with t, and the output
# will be large. Having it in bf16 will lose a lot of precision and cause the
# cos & sin output to change significantly.
# We want to recompute self.inv_freq if it was not loaded in fp32
if self.inv_freq.dtype != torch.float32:
inv_freq = self._compute_inv_freq(device=device)
else:
inv_freq = self.inv_freq
else:
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
inv_freq = self.inv_freq
# Don't do einsum, it converts fp32 to fp16 under AMP
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
freqs = torch.outer(t, inv_freq)
if self.scale is None:
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
else:
power = (
torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
- seqlen // 2
) / self.scale_base
scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
# We want the multiplication by scale to happen in fp32
self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
def forward(
self,
qkv: torch.Tensor,
kv: Optional[torch.Tensor] = None,
seqlen_offset: Union[int, torch.Tensor] = 0,
max_seqlen: Optional[int] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""
qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
else it's just q of shape (batch, seqlen, nheads, headdim)
kv: (batch, seqlen, 2, nheads, headdim)
seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
should pass in max_seqlen, which will update the cos / sin cache up to that length.
Apply rotary embedding *inplace* to qkv and / or kv.
"""
seqlen = qkv.shape[1]
if max_seqlen is not None:
self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
elif isinstance(seqlen_offset, int):
self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
if kv is None:
if self.scale is None:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
q = qkv
q = apply_rotary_emb_func(
q,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
inplace=True,
seqlen_offsets=seqlen_offset,
)
if self.scale is None:
kv = apply_rotary_emb_kv_(
kv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
kv = apply_rotary_emb_kv_(
kv,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
return q, kv
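# Hypothetical usage sketch (not part of the original file): applying RotaryEmbedding to a
# packed qkv tensor. The Triton kernel runs on GPU, so this assumes a CUDA device; the shapes
# (batch 2, seqlen 128, 8 heads of dim 64) are illustrative.
def _example_rotary_embedding():
    headdim = 64
    rotary = RotaryEmbedding(dim=headdim, interleaved=False, device="cuda")
    qkv = torch.randn(2, 128, 3, 8, headdim, device="cuda", dtype=torch.float16)
    qkv = rotary(qkv)  # rotary applied in-place to Q and K; V is left untouched
    return qkv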
# Copyright (c) 2023, Tri Dao.
import torch
import torch.nn as nn
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
class CrossEntropyLoss(nn.Module):
def __init__(
self,
ignore_index=-100,
reduction="mean",
label_smoothing=0.0,
logit_scale=1.0,
lse_square_scale=0.0,
inplace_backward=False,
process_group=None,
return_z_loss=False,
):
"""
Arguments:
            ignore_index: int. If labels == ignore_index, the loss is set to 0.0.
label_smoothing: float
lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
This is also referred to as "z-loss".
inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
This saves memory.
process_group: if not None, we're doing Tensor Parallel: each process is responsible for
one part of the vocab. The loss will be aggregated across processes.
return_z_loss: bool. If True, we return the component of the loss contributed by
the lse_square_scale value. This value is only for logging and does not support
backprop.
"""
super().__init__()
if reduction not in ["mean", "none", "sum"]:
raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
self.ignore_index = ignore_index
self.reduction = reduction
self.label_smoothing = label_smoothing
self.logit_scale = logit_scale
self.lse_square_scale = lse_square_scale
self.inplace_backward = inplace_backward
self.process_group = process_group
self.return_z_loss = return_z_loss
def forward(self, input, target):
"""
Arguments:
input: (batch, vocab_size)
target: (batch,)
Returns:
losses: (batch,) if reduction is 'none', else (1,), dtype float
z_loss: (batch,) if reduction is 'none', else (1,), dtype float (if self.return_z_loss)
"""
assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
loss, z_loss = cross_entropy_loss(
input,
target,
label_smoothing=self.label_smoothing,
logit_scale=self.logit_scale,
lse_square_scale=self.lse_square_scale,
ignored_index=self.ignore_index,
inplace_backward=self.inplace_backward,
process_group=self.process_group,
)
if self.reduction == "mean":
loss = loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
loss = loss.sum()
else:
loss = loss
if not self.return_z_loss:
return loss
if self.reduction == "mean":
z_loss = z_loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
z_loss = z_loss.sum()
else:
z_loss = z_loss
return loss, z_loss
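# Hypothetical usage sketch (not part of the original file): the Triton kernel only supports
# CUDA tensors, so both logits and labels must live on the GPU. The vocab size and batch size
# below are arbitrary examples.
def _example_cross_entropy_loss():
    loss_fn = CrossEntropyLoss(inplace_backward=True)
    logits = torch.randn(8, 32000, device="cuda", dtype=torch.float16, requires_grad=True)
    labels = torch.randint(0, 32000, (8,), device="cuda")
    loss = loss_fn(logits, labels)
    loss.backward()
    return loss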
# Copyright (c) 2023, GGGGGGXY, Tri Dao.
import math
import json
import re
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, AutoConfig, PretrainedConfig
def remap_state_dict_hf_baichuan(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^model.", "transformer.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(
r"^transformer.embed_tokens.",
"transformer.embeddings.word_embeddings.",
key,
)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict[
"transformer.embeddings.word_embeddings.weight"
]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings
# differently.
vocab_size = (
math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.",
r"transformer.layers.\1.norm1.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
for l in range(config.n_layer):
w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight")
w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight")
# Our ordering is different
state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat(
[w3, w1], dim=0
)
def key_mapping_mlp(key):
return re.sub(
r"^transformer.layers.(\d+).mlp.down_proj.",
r"transformer.layers.\1.mlp.fc2.",
key,
)
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attn.W_pack.",
r"transformer.layers.\1.mixer.Wqkv.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attn.o_proj.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
for l in range(config.n_layer):
# pop rotary_emb.inv_freq from state dict
state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None)
return state_dict
def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config:
    # HACK: the config doesn't say whether it's rotary or alibi,
    # so we have to infer from the hidden size (7B -> rotary, 13B -> alibi).
    # HACK: the config doesn't say whether it uses norm head,
    # so we have to infer from the vocab size
    # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head).
use_rotary = baichuan_config.hidden_size < 5000
return GPT2Config(
vocab_size=baichuan_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=baichuan_config.hidden_size,
n_layer=baichuan_config.num_hidden_layers,
n_head=baichuan_config.num_attention_heads,
n_inner=baichuan_config.intermediate_size,
activation_function="swiglu", # Hardcode since HF calls it 'silu'
# baichuan doesn't have dropout, idk if it's because they only release the inference code
resid_pdrop=0.0,
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=baichuan_config.rms_norm_eps,
initializer_range=baichuan_config.initializer_range,
bos_token_id=baichuan_config.bos_token_id,
eos_token_id=baichuan_config.eos_token_id,
# These are new arguments not in the original GPT2Config
pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything
rms_norm=True,
rotary_emb_fraction=1.0 if use_rotary else 0.0,
rotary_emb_interleaved=False,
use_alibi=not use_rotary,
use_flash_attn=not use_rotary, # Alibi code path requires flash_attn
tie_word_embeddings=False,
norm_head=baichuan_config.vocab_size > 70000,
qkv_proj_bias=False,
out_proj_bias=False,
mlp_fc1_bias=False,
mlp_fc2_bias=False,
)
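# Hypothetical usage sketch (not part of the original file): converting a Baichuan Hugging Face
# config into the GPT2Config consumed here. The model name is illustrative and requires
# trust_remote_code since Baichuan ships custom config code.
def _example_baichuan_config_conversion():
    baichuan_config = AutoConfig.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
    return baichuan_config_to_gpt2_config(baichuan_config)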
# Copyright (c) 2022, Tri Dao.
# This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
# https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
# https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
import logging
import re
from collections import OrderedDict
from collections.abc import Sequence
from functools import partial
from typing import Any, Mapping
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import BertConfig, PretrainedConfig
from transformers.models.bert.modeling_bert import (
BaseModelOutputWithPoolingAndCrossAttentions,
BertForPreTrainingOutput,
)
from flash_attn.bert_padding import (
index_first_axis,
index_first_axis_residual,
pad_input,
unpad_input,
)
from flash_attn.modules.block import Block
from flash_attn.modules.embedding import BertEmbeddings
from flash_attn.modules.mha import MHA
from flash_attn.modules.mlp import FusedMLP, Mlp
from flash_attn.utils.pretrained import state_dict_from_pretrained
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
try:
from flash_attn.ops.triton.layer_norm import layer_norm_fn
except ImportError:
layer_norm_fn = None
try:
from flash_attn.losses.cross_entropy import CrossEntropyLoss
except ImportError:
CrossEntropyLoss = None
logger = logging.getLogger(__name__)
def create_mixer_cls(config, cross_attn=False, return_residual=False):
use_flash_attn = getattr(config, "use_flash_attn", False)
fused_bias_fc = getattr(config, "fused_bias_fc", False)
rotary_kwargs = {}
if config.position_embedding_type == "rotary":
rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size)
rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None)
rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False)
mixer_cls = partial(
MHA,
num_heads=config.num_attention_heads,
cross_attn=cross_attn,
dropout=config.attention_probs_dropout_prob,
causal=False,
fused_bias_fc=fused_bias_fc,
use_flash_attn=use_flash_attn,
return_residual=return_residual,
**rotary_kwargs,
)
return mixer_cls
def create_mlp_cls(config, layer_idx=None, return_residual=False):
inner_dim = config.intermediate_size
fused_mlp = getattr(config, "fused_mlp", False)
if fused_mlp:
        assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
            "fused_mlp only supports approximate gelu"
        )
if not fused_mlp:
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
mlp_cls = partial(
Mlp,
hidden_features=inner_dim,
activation=partial(F.gelu, approximate=approximate),
return_residual=return_residual,
)
else:
if FusedMLP is None:
raise ImportError("fused_dense is not installed")
mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
# mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
if isinstance(mlp_checkpoint_lvl, Sequence):
assert layer_idx is not None
mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
mlp_cls = partial(
FusedMLP,
hidden_features=inner_dim,
checkpoint_lvl=mlp_checkpoint_lvl,
return_residual=return_residual,
)
return mlp_cls
def create_block(config, layer_idx=None):
last_layer_subset = getattr(config, "last_layer_subset", False)
cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1
# TD [2022-12-19]: For cross attention (last layer), we actually want to return the
# residual x_kv, not residual x. But it's annoying to change the API (and it only affects
# one layer) so we just choose not to return residual in this case.
return_residual = not cross_attn
mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual)
mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual)
norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps)
block = Block(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
prenorm=False,
resid_dropout1=config.hidden_dropout_prob,
resid_dropout2=config.hidden_dropout_prob,
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
return_residual=return_residual,
)
return block
# https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
def _init_weights(module, initializer_range=0.02):
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=initializer_range)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=initializer_range)
if module.padding_idx is not None:
nn.init.zeros_(module.weight[module.padding_idx])
class BertEncoder(nn.Module):
def __init__(self, config: BertConfig):
super().__init__()
self.use_flash_attn = getattr(config, "use_flash_attn", False)
self.layers = nn.ModuleList(
[create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
)
def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
"""If subset_mask is not None, we only want output for the subset of the sequence.
This means that we only compute the last layer output for these tokens.
subset_mask: (batch, seqlen), dtype=torch.bool
"""
if key_padding_mask is None or not self.use_flash_attn:
mixer_kwargs = (
{"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
)
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if subset_mask is not None:
hidden_states = hidden_states[subset_mask]
else:
batch, seqlen = hidden_states.shape[:2]
hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
hidden_states, key_padding_mask
)
mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
if subset_mask is None:
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
hidden_states = pad_input(hidden_states, indices, batch, seqlen)
else:
for layer in self.layers[:-1]:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if key_padding_mask is not None:
subset_idx = torch.nonzero(
subset_mask[key_padding_mask], as_tuple=False
).flatten()
subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
else:
subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten()
subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
hidden_states_subset, hidden_states = index_first_axis_residual(
hidden_states, subset_idx
)
# It's ok to set max_seqlen_q to be much larger
mixer_kwargs = {
"x_kv": hidden_states,
"cu_seqlens": subset_cu_seqlens,
"max_seqlen": max_seqlen_in_batch,
"cu_seqlens_k": cu_seqlens,
"max_seqlen_k": max_seqlen_in_batch,
}
hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
return hidden_states
class BertPooler(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states, pool=True):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0] if pool else hidden_states
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
self.transform_act_fn = nn.GELU(approximate=approximate)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
if not self.fused_dropout_add_ln:
hidden_states = self.layer_norm(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps
)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True)
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BertPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
"""An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
@classmethod
def from_pretrained(cls, model_name, config, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
            model_name: either:
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
# Instantiate model.
model = cls(config, *inputs, **kwargs)
load_return = model.load_state_dict(
remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False
)
logger.info(load_return)
return model
class BertModel(BertPreTrainedModel):
def __init__(self, config: BertConfig, add_pooling_layer=True):
super().__init__(config)
self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if config.vocab_size % self.pad_vocab_size_multiple != 0:
config.vocab_size += self.pad_vocab_size_multiple - (
config.vocab_size % self.pad_vocab_size_multiple
)
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
self.embeddings = BertEmbeddings(
config.hidden_size,
config.vocab_size,
config.max_position_embeddings,
config.type_vocab_size,
padding_idx=config.pad_token_id,
)
self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) if add_pooling_layer else None
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
masked_tokens_mask=None,
):
"""If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
we only want the output for the masked tokens. This means that we only compute the last
layer output for these tokens.
masked_tokens_mask: (batch, seqlen), dtype=torch.bool
"""
hidden_states = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids
)
        # TD [2022-12-18]: Don't need to force residual in fp32
# BERT puts embedding LayerNorm before embedding dropout.
if not self.fused_dropout_add_ln:
hidden_states = self.emb_ln(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps
)
hidden_states = self.emb_drop(hidden_states)
if masked_tokens_mask is not None:
batch_size, seqlen = input_ids.shape[:2]
# We also need the first column for the CLS token
first_col_mask = torch.zeros(
batch_size, seqlen, dtype=torch.bool, device=input_ids.device
)
first_col_mask[:, 0] = True
subset_mask = masked_tokens_mask | first_col_mask
else:
subset_mask = None
sequence_output = self.encoder(
hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask
)
if masked_tokens_mask is None:
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
else:
# TD [2022-03-01]: the indexing here is very tricky.
if attention_mask is not None:
subset_idx = subset_mask[attention_mask]
pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]]
sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]]
else:
pool_input = sequence_output[first_col_mask[subset_mask]]
sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
)
class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config: BertConfig):
super().__init__(config)
# If dense_seq_output, we only need to pass the hidden states for the masked out tokens
# (around 15%) to the classifier heads.
self.dense_seq_output = getattr(config, "dense_seq_output", False)
        # If last_layer_subset, we only need to compute the last layer for a subset of tokens
        # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
self.last_layer_subset = getattr(config, "last_layer_subset", False)
if self.last_layer_subset:
assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
use_xentropy = getattr(config, "use_xentropy", False)
if use_xentropy and CrossEntropyLoss is None:
raise ImportError("xentropy_cuda is not installed")
loss_cls = (
nn.CrossEntropyLoss
if not use_xentropy
else partial(CrossEntropyLoss, inplace_backward=True)
)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config)
self.mlm_loss = loss_cls(ignore_index=0)
self.nsp_loss = loss_cls(ignore_index=-1)
# Initialize weights and apply final processing
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
self.tie_weights()
def tie_weights(self):
self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
labels=None,
next_sentence_label=None,
):
"""
If labels are provided, they must be 0 for masked out tokens (as specified in the attention
mask).
Outputs:
if `labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
"""
masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
outputs = self.bert(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
attention_mask=attention_mask.bool() if attention_mask is not None else None,
masked_tokens_mask=masked_tokens_mask,
)
sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
if self.dense_seq_output and labels is not None:
masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
if not self.last_layer_subset:
sequence_output = index_first_axis(
rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
)
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
total_loss = None
if labels is not None and next_sentence_label is not None:
if (
self.dense_seq_output and labels is not None
): # prediction_scores are already flattened
masked_lm_loss = self.mlm_loss(
prediction_scores, labels.flatten()[masked_token_idx]
)
else:
masked_lm_loss = self.mlm_loss(
rearrange(prediction_scores, "... v -> (...) v"),
rearrange(labels, "... -> (...)"),
)
next_sentence_loss = self.nsp_loss(
rearrange(seq_relationship_score, "... t -> (...) t"),
rearrange(next_sentence_label, "... -> (...)"),
)
total_loss = masked_lm_loss.float() + next_sentence_loss.float()
return BertForPreTrainingOutput(
loss=total_loss,
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
)
def remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
"""
# LayerNorm
def key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
return key
state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
# Layers
def key_mapping_layers(key):
return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
key = re.sub(
r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm2.\2",
key,
)
key = re.sub(
r"^cls.predictions.transform.LayerNorm.(weight|bias)",
r"cls.predictions.transform.layer_norm.\1",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
last_layer_subset = getattr(config, "last_layer_subset", False)
for d in range(config.num_hidden_layers):
Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
if not (last_layer_subset and d == config.num_hidden_layers - 1):
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
[Wq, Wk, Wv], dim=0
)
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
else:
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
def key_mapping_attn(key):
return re.sub(
r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mixer.out_proj.\2",
key,
)
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
def key_mapping_decoder_bias(key):
return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
)
decoder_weight = state_dict["cls.predictions.decoder.weight"]
state_dict["cls.predictions.decoder.weight"] = F.pad(
decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
)
# If the vocab was padded, we want to set the decoder bias for those padded indices to be
# strongly negative (i.e. the decoder shouldn't predict those indices).
# TD [2022-05-09]: I don't think it affects the MLPerf training.
decoder_bias = state_dict["cls.predictions.decoder.bias"]
state_dict["cls.predictions.decoder.bias"] = F.pad(
decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
)
return state_dict
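# Hypothetical usage sketch (not part of the original file): remapping a Hugging Face
# bert-base-uncased checkpoint so it loads into the flash_attn BertForPreTraining defined above.
# The model name is illustrative; strict=False tolerates keys present on only one side.
def _example_remap_hf_bert():
    config = BertConfig.from_pretrained("bert-base-uncased")
    state_dict = remap_state_dict(state_dict_from_pretrained("bert-base-uncased"), config)
    model = BertForPreTraining(config)
    model.load_state_dict(state_dict, strict=False)
    return model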
def inv_remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BERT compatible.
This function is meant to be the inverse of remap_state_dict.
"""
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
decoder_weight = state_dict["cls.predictions.decoder.weight"]
decoder_bias = state_dict["cls.predictions.decoder.bias"]
# unpad embeddings
state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[
: config.orig_vocab_size, :
]
state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :]
state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size]
for d in range(config.num_hidden_layers):
last_layer_subset = getattr(config, "last_layer_subset", False)
if not last_layer_subset or d != (config.num_hidden_layers - 1):
Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[
: Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[
Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[
2 * Wqkv_weights.shape[0] // 3 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[
: Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[
Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[
2 * Wqkv_biases.shape[0] // 3 :
]
else:
Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[
: Wkv_weights.shape[0] // 2, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[
Wkv_weights.shape[0] // 2 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
: Wkv_biases.shape[0] // 2
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[
Wkv_biases.shape[0] // 2 :
]
def inv_key_mapping_ln(key):
key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
key = re.sub(
r"bert.encoder.layers.(\d+).norm1.(weight|bias)",
r"bert.encoder.layers.\1.attention.output.LayerNorm.\2",
key,
)
key = re.sub(
r"bert.encoder.layers.(\d+).norm2.(weight|bias)",
r"bert.encoder.layers.\1.output.LayerNorm.\2",
key,
)
key = re.sub(
r"cls.predictions.transform.layer_norm.(weight|bias)",
r"cls.predictions.transform.LayerNorm.\1",
key,
)
return key
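    # The original BERT checkpoints name the LayerNorm parameters "gamma"/"beta" rather than
    # "weight"/"bias"; the mapping below emits that older naming.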
def inv_key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key)
key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key)
return key
def inv_key_mapping_layers(key):
return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key)
def inv_key_mapping_mlp(key):
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)",
r"bert.encoder.layer.\1.intermediate.dense.\2",
key,
)
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)",
r"bert.encoder.layer.\1.output.dense.\2",
key,
)
return key
def inv_key_mapping_attn(key):
return re.sub(
r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)",
r"bert.encoder.layer.\1.attention.output.dense.\2",
key,
)
def inv_key_mapping_decoder_bias(key):
return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key)
state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_layers(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_attn(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items()
)
return state_dict
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig
def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
"""
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
r"transformer.layers.\1.norm\2.\3",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.weight",
r"transformer.layers.\1.mlp.fc1.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.weight",
r"transformer.layers.\1.mlp.fc2.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.bias",
r"transformer.layers.\1.mlp.fc1.bias",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.bias",
r"transformer.layers.\1.mlp.fc2.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# TODO: add support for multi-head attention
assert config.multi_query, "Only multi-query attention is supported"
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
        # with multi-query attention, c_attn.weight has shape (embed_dim + 2 * head_dim, embed_dim)
# see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112
# see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183
# ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim)
q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
# duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
k = torch.tile(k, (config.n_head, 1))
v = torch.tile(v, (config.n_head, 1))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)
# same deal with the bias
c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
        # ((n_head + 2) * head_dim,) -> (3 * n_head * head_dim,)
q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis: (head_dim,) -> (n_head * head_dim,)
k = torch.tile(k, (config.n_head,))
v = torch.tile(v, (config.n_head,))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)
def key_mapping_attn(key):
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.weight",
r"transformer.layers.\1.mixer.out_proj.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.bias",
r"transformer.layers.\1.mixer.out_proj.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
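# Illustrative shapes for the multi-query expansion above (numbers are hypothetical, not tied to
# any particular checkpoint): with n_embd = 2048, n_head = 16, head_dim = 128, c_attn.weight has
# shape (2048 + 128 + 128, 2048); after tiling the single K and V heads 16x each, Wqkv.weight
# has shape (3 * 2048, 2048).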
def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.
This function is meant to be the inverse of remap_state_dict_hf_bigcode.
"""
# Word embedding and position embeddings
def inv_key_mapping_pos_emb(key):
return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)
state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    word_embeddings = word_embeddings[: config.vocab_size, :]  # unpad the vocab dimension
state_dict["transformer.wte.weight"] = word_embeddings
state_dict["lm_head.weight"] = word_embeddings
# LayerNorm
def inv_key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.layers.(\d+).norm(1|2).(weight|bias)",
r"transformer.h.\1.ln_\2.\3",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())
# MLPs
def inv_key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.weight",
r"transformer.h.\1.mlp.c_fc.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.weight",
r"transformer.h.\1.mlp.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.bias",
r"transformer.h.\1.mlp.c_fc.bias",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.bias",
r"transformer.h.\1.mlp.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
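        # Inverse of the multi-query expansion: Wqkv packs [Q (embed_dim rows); K (n_head * head_dim
        # rows); V (n_head * head_dim rows)], where the K and V blocks are n_head identical copies of
        # a single head. Keeping only the first head_dim rows of each (k[:head_dim], v[:head_dim])
        # recovers the HF c_attn layout, assuming the state_dict came from remap_state_dict_hf_bigcode
        # (or otherwise has identical K/V copies).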
Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
q, k, v = torch.split(
Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight
# Same deal with the bias
Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
q, k, v = torch.split(
Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias
def inv_key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.weight",
r"transformer.h.\1.attn.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.bias",
r"transformer.h.\1.attn.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
return GPT2Config(
activation_function=bigcode_config.activation_function,
attn_pdrop=bigcode_config.attn_pdrop,
bos_token_id=bigcode_config.bos_token_id,
embd_pdrop=bigcode_config.embd_pdrop,
eos_token_id=bigcode_config.eos_token_id,
initializer_range=bigcode_config.initializer_range,
layer_norm_epsilon=bigcode_config.layer_norm_epsilon,
max_batch_size=bigcode_config.max_batch_size,
max_sequence_length=bigcode_config.max_sequence_length,
model_type=bigcode_config.model_type,
multi_query=bigcode_config.multi_query,
n_embd=bigcode_config.n_embd,
n_head=bigcode_config.n_head,
n_inner=bigcode_config.n_inner,
n_layer=bigcode_config.n_layer,
n_positions=bigcode_config.n_positions,
resid_pdrop=bigcode_config.resid_pdrop,
scale_attn_weights=bigcode_config.scale_attn_weights,
summary_activation=bigcode_config.summary_activation,
summary_first_dropout=bigcode_config.summary_first_dropout,
summary_proj_to_labels=bigcode_config.summary_proj_to_labels,
summary_type=bigcode_config.summary_type,
summary_use_proj=bigcode_config.summary_use_proj,
use_cache=bigcode_config.use_cache,
vocab_size=bigcode_config.vocab_size,
)
# Copyright (c) 2023, Tri Dao.
import math
import json
import re
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, AutoConfig, PretrainedConfig
def remap_state_dict_hf_btlm(state_dict, config):
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
if "transformer.wpe.weight" in state_dict:
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
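    # BTLM uses a gated (SwiGLU-style) MLP: c_fc and c_fc2 are the two input projections, stacked
    # row-wise into a single fc1 weight/bias below. The .t() calls are needed because the HF
    # checkpoint stores these weights transposed, as (in_features, out_features).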
for d in range(config.num_hidden_layers):
W1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.weight")
W3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.weight")
state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = torch.cat([W1.t(), W3.t()], dim=0)
b1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.bias")
b3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.bias")
state_dict[f"transformer.layers.{d}.mlp.fc1.bias"] = torch.cat([b1, b3], dim=0)
W2 = state_dict.pop(f"transformer.h.{d}.mlp.c_proj.weight")
state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
def key_mapping_mlp(key):
key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
Wqkv = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
Wout = state_dict.pop(f"transformer.h.{d}.attn.c_proj.weight")
state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
state_dict.pop(f"transformer.relative_pe.slopes") # We don't store the Alibi slopes
def key_mapping_attn(key):
key = re.sub(r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config:
return GPT2Config(
vocab_size=btlm_config.vocab_size,
n_positions=0 if btlm_config.position_embedding_type == "alibi" else btlm_config.n_positions,
n_embd=btlm_config.hidden_size,
n_layer=btlm_config.num_hidden_layers,
n_head=btlm_config.num_attention_heads,
n_inner=btlm_config.n_inner,
activation_function=btlm_config.activation_function,
resid_pdrop=btlm_config.resid_pdrop,
embd_pdrop=btlm_config.embd_pdrop,
attn_pdrop=btlm_config.attn_pdrop,
layer_norm_epsilon=btlm_config.layer_norm_epsilon,
initializer_range=btlm_config.initializer_range,
bos_token_id=btlm_config.bos_token_id,
eos_token_id=btlm_config.eos_token_id,
# These are new arguments not in the original GPT2Config
use_alibi=btlm_config.position_embedding_type == "alibi",
use_flash_attn=btlm_config.position_embedding_type == "alibi", # Alibi code path requires flash_attn
mup_width_scale=btlm_config.mup_width_scale,
mup_embeddings_multiplier=btlm_config.mup_embeddings_scale,
mup_output_multiplier=btlm_config.mup_output_alpha,
mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d,
mlp_multiple_of=1,
)
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import FalconConfig, GPT2Config
def remap_state_dict_hf_falcon(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^transformer.h.", "transformer.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(
r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key
)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
output_embeddings_bias = state_dict.pop("lm_head.bias")
state_dict["lm_head.bias"] = F.pad(
output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key)
key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attention.query_key_value.",
r"transformer.layers.\1.mixer.Wqkv.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.dense.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", 1)
headdim = config.hidden_size // n_head
for l in range(config.n_layer):
# The weights are stored in a different layout compared to our implementation
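        # HF Falcon groups the fused QKV rows by KV group: each group holds n_head // n_head_kv
        # query heads followed by one key head and one value head. The rearrange below exposes that
        # grouping so the rows can be regrouped into flash_attn's [all Q; all K; all V] ordering.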
Wqkv = rearrange(
state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"),
"(group ratio headdim) ... -> group ratio headdim ...",
ratio=n_head // n_head_kv + 2,
headdim=headdim,
)
Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...")
Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...")
Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
return state_dict
def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config:
# The 40b config uses "n_head_kv" instead of "num_kv_heads"
n_head_kv = getattr(
falcon_config,
"n_head_kv",
1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head,
)
# HACK: the 40b config has 2 LN per layer instead of 1, but that's not reflected in the config.
# So we have to infer it from the number of heads in the key/value block
parallel_block_tied_norm = n_head_kv == 1
return GPT2Config(
vocab_size=falcon_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=falcon_config.hidden_size,
n_layer=falcon_config.n_layer,
n_head=falcon_config.n_head,
n_inner=falcon_config.hidden_size * 4,
activation_function="gelu",
resid_pdrop=falcon_config.hidden_dropout,
embd_pdrop=0.0, # There doesn't seem to be any embedding dropout
attn_pdrop=falcon_config.attention_dropout,
layer_norm_epsilon=falcon_config.layer_norm_epsilon,
initializer_range=falcon_config.initializer_range,
bos_token_id=falcon_config.bos_token_id,
eos_token_id=falcon_config.eos_token_id,
# These are new arguments not in the original GPT2Config
parallel_block=falcon_config.parallel_attn,
n_head_kv=n_head_kv,
parallel_block_tied_norm=parallel_block_tied_norm,
rotary_emb_fraction=1.0,
rotary_emb_interleaved=False,
tie_word_embeddings=True,
qkv_proj_bias=falcon_config.bias,
out_proj_bias=falcon_config.bias,
mlp_fc1_bias=falcon_config.bias,
mlp_fc2_bias=falcon_config.bias,
lm_head_bias=False,
)
# Copyright (c) 2024, Tri Dao.
import logging
import math
import re
from collections import OrderedDict, namedtuple
from collections.abc import Sequence
from functools import partial
from typing import Dict, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config
from flash_attn.models.bigcode import remap_state_dict_hf_bigcode
from flash_attn.models.falcon import remap_state_dict_hf_falcon
from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox
from flash_attn.models.gptj import remap_state_dict_hf_gptj
from flash_attn.models.llama import remap_state_dict_hf_llama
from flash_attn.models.opt import remap_state_dict_hf_opt
from flash_attn.modules.block import Block, ParallelBlock
from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings
from flash_attn.modules.mha import MHA, ParallelMHA
from flash_attn.modules.mlp import (
FusedMLP,
GatedMlp,
Mlp,
ParallelFusedMLP,
ParallelGatedMlp,
ParallelMLP,
)
from flash_attn.ops.activations import sqrelu_fwd
from flash_attn.utils.distributed import (
all_gather,
all_gather_raw,
get_dim_for_local_rank,
sync_shared_params,
)
from flash_attn.utils.generation import GenerationMixin
from flash_attn.utils.pretrained import state_dict_from_pretrained
try:
from flash_attn.ops.fused_dense import ColumnParallelLinear
except ImportError:
ColumnParallelLinear = None
try:
from flash_attn.ops.triton.mlp import FusedDenseSqreluDense
except ImportError:
FusedDenseSqreluDense = None
try:
from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
except ImportError:
layer_norm_fn, RMSNorm = None, None
logger = logging.getLogger(__name__)
def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
attn_scale_power = 0.5 if not getattr(config, "mup_scale_qk_dot_by_d", False) else 1.0
softmax_scale = 1.0 if not config.scale_attn_weights else (head_dim ** (-attn_scale_power))
softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0)
if config.scale_attn_by_inverse_layer_idx:
assert layer_idx is not None
softmax_scale /= float(layer_idx + 1)
dwconv = getattr(config, "attn_dwconv", False)
if dwconv:
assert process_group is None, "TensorParallel MHA does not support dwconv yet"
qkv_proj_bias = getattr(config, "qkv_proj_bias", True)
out_proj_bias = getattr(config, "out_proj_bias", True)
rotary_emb_dim = int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim)
rotary_emb_base = getattr(config, "rotary_emb_base", 10000.0)
rotary_emb_scale_base = getattr(config, "rotary_emb_scale_base", None)
rotary_emb_interleaved = getattr(config, "rotary_emb_interleaved", False)
use_alibi = getattr(config, "use_alibi", False)
window_size = getattr(config, "window_size", (-1, -1))
use_flash_attn = getattr(config, "use_flash_attn", False)
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if not fused_bias_fc:
assert process_group is None, "TensorParallel MHA requires fused_bias_fc"
mha_cls = MHA if process_group is None else ParallelMHA
serial_kwargs = (
{"fused_bias_fc": fused_bias_fc, "dwconv": dwconv} if process_group is None else {}
)
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
num_heads_kv = getattr(config, "n_head_kv", None)
mixer_cls = partial(
mha_cls,
num_heads=config.num_attention_heads,
num_heads_kv=num_heads_kv,
qkv_proj_bias=qkv_proj_bias,
out_proj_bias=out_proj_bias,
dropout=config.attn_pdrop,
softmax_scale=softmax_scale,
causal=True,
layer_idx=layer_idx,
rotary_emb_dim=rotary_emb_dim,
rotary_emb_base=rotary_emb_base,
rotary_emb_scale_base=rotary_emb_scale_base,
rotary_emb_interleaved=rotary_emb_interleaved,
use_alibi=use_alibi,
window_size=window_size,
use_flash_attn=use_flash_attn,
**serial_kwargs,
**parallel_kwargs,
**factory_kwargs,
)
return mixer_cls
def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True)
mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True)
fused_mlp = getattr(config, "fused_mlp", False)
if fused_mlp:
assert config.activation_function in [
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
]
fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False)
if fused_dense_sqrelu_dense:
assert config.activation_function == "sqrelu", (
"fused_dense_sqrelu_dense only " "supports approximate activation_function sqrelu"
)
assert not (fused_dense_sqrelu_dense and fused_mlp)
if not fused_mlp and not fused_dense_sqrelu_dense:
assert config.activation_function in [
"gelu",
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
"glu",
"swiglu",
"geglu",
]
if config.activation_function in ["glu", "swiglu", "geglu"]:
activation = (
F.sigmoid
if config.activation_function == "glu"
else (F.silu if config.activation_function == "swiglu" else F.gelu)
)
mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_multiple_of = getattr(config, "mlp_multiple_of", 128)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
multiple_of=mlp_multiple_of,
**parallel_kwargs,
**factory_kwargs,
)
else:
if config.activation_function == "relu":
activation = partial(F.relu, inplace=True)
elif config.activation_function == "sqrelu":
activation = sqrelu_fwd
else:
approximate = (
"tanh"
if config.activation_function
in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
else "none"
)
activation = partial(F.gelu, approximate=approximate)
mlp_cls = Mlp if process_group is None else ParallelMLP
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
**parallel_kwargs,
**factory_kwargs,
)
else:
mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
# mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
if isinstance(mlp_checkpoint_lvl, Sequence):
assert layer_idx is not None
mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
if fused_mlp:
if FusedMLP is None:
raise ImportError("fused_dense is not installed")
activation = (
"gelu_approx"
if config.activation_function
in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
else config.activation_function
)
mlp_cls = FusedMLP if process_group is None else ParallelFusedMLP
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
checkpoint_lvl=mlp_checkpoint_lvl,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
**parallel_kwargs,
**factory_kwargs,
)
elif fused_dense_sqrelu_dense:
            assert (
                process_group is None
            ), "Tensor Parallel is not implemented for FusedDenseSqreluDense"
assert FusedDenseSqreluDense is not None
mlp_cls = partial(
FusedDenseSqreluDense,
hidden_features=config.n_inner,
checkpoint_lvl=mlp_checkpoint_lvl,
**factory_kwargs,
)
else:
raise RuntimeError("MLP type not supported")
return mlp_cls
def create_block(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
sequence_parallel = getattr(config, "sequence_parallel", True)
mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
use_rms_norm = getattr(config, "rms_norm", False)
norm_cls = partial(
nn.LayerNorm if not use_rms_norm else RMSNorm,
eps=config.layer_norm_epsilon,
**factory_kwargs,
)
# TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
residual_in_fp32 = getattr(config, "residual_in_fp32", False)
resid_dropout1 = config.resid_pdrop if layer_idx is None or layer_idx > 0 else config.embd_pdrop
prenorm = getattr(config, "prenorm", True)
parallel_block = getattr(config, "parallel_block", False)
if not parallel_block:
block = Block(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
prenorm=prenorm,
resid_dropout1=resid_dropout1,
resid_dropout2=config.resid_pdrop,
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
residual_in_fp32=residual_in_fp32,
sequence_parallel=sequence_parallel and process_group is not None,
mark_shared_params=process_group is not None,
)
else:
assert prenorm
block = ParallelBlock(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
resid_dropout1=resid_dropout1,
resid_dropout2=config.resid_pdrop,
tied_norm=getattr(config, "parallel_block_tied_norm", False),
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
residual_in_fp32=residual_in_fp32,
sequence_parallel=sequence_parallel and process_group is not None,
mark_shared_params=process_group is not None,
)
block.layer_idx = layer_idx
return block
class GPTPreTrainedModel(nn.Module):
"""An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__()
if not isinstance(config, GPT2Config):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
@classmethod
def from_pretrained(
cls,
model_name,
config,
*args,
strict=True,
device=None,
dtype=None,
world_size=1,
rank=0,
**kwargs,
):
"""
Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
"""
# Instantiate model.
model = cls(config, *args, device=device, dtype=dtype, **kwargs)
# Load state_dict in cpu because we already initialized the model in GPU, and we don't
# want extra stuff taking up more GPU memory
state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype)
if model_name.startswith("gpt2"):
state_dict = remap_state_dict_hf_gpt2(state_dict, config)
elif model_name.startswith("facebook/opt"):
state_dict = remap_state_dict_hf_opt(state_dict, config)
elif model_name.startswith("EleutherAI/gpt-j-") or model_name.startswith(
"togethercomputer/GPT-JT-"
):
state_dict = remap_state_dict_hf_gptj(state_dict, config)
elif (
model_name.startswith("EleutherAI/gpt-neox-")
or model_name.startswith("EleutherAI/pythia-")
or model_name.startswith("togethercomputer/RedPajama-INCITE-")
):
state_dict = remap_state_dict_hf_gpt_neox(state_dict, config)
elif model_name.startswith("tiiuae/falcon-"):
state_dict = remap_state_dict_hf_falcon(state_dict, config)
elif model_name.startswith("meta-llama/Llama-"):
state_dict = remap_state_dict_hf_llama(state_dict, config)
elif model_name.startswith("bigcode/") or model_name.startswith("WizardLM/"):
state_dict = remap_state_dict_hf_bigcode(state_dict, config)
else:
raise NotImplementedError(f"Model {model_name} not supported")
if world_size > 1:
state_dict = shard_state_dict_tp(state_dict, config, world_size, rank)
load_return = model.load_state_dict(state_dict, strict=strict)
logger.info(load_return)
return model
# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
def _init_weights(
module, n_layer, initializer_range=0.02, mup_width_scale=1.0, rescale_prenorm_residual=True
):
mup_init_scale = math.sqrt(mup_width_scale)
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=initializer_range * mup_init_scale)
optim_cfg = getattr(module.weight, "_optim", {})
optim_cfg.update({"lr_multiplier": mup_width_scale})
setattr(module.weight, "_optim", optim_cfg)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=initializer_range)
if rescale_prenorm_residual:
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
#
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
for name, p in module.named_parameters():
if name in ["out_proj.weight", "fc2.weight"]:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
nn.init.normal_(
p, mean=0.0, std=initializer_range * mup_init_scale / math.sqrt(2 * n_layer)
)
class GPTModel(GPTPreTrainedModel):
def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
super().__init__(config)
factory_kwargs = {"device": device, "dtype": dtype}
self.process_group = process_group
self.sequence_parallel = getattr(config, "sequence_parallel", True)
assert config.activation_function in [
"gelu",
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
"glu",
"swiglu",
"geglu",
]
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
self.embeddings_multiplier = getattr(config, "mup_embeddings_multiplier", 1.0)
# TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
self.residual_in_fp32 = getattr(config, "residual_in_fp32", False)
# These 2 options are for OPT-350m
self.prenorm = getattr(config, "prenorm", True)
use_rms_norm = getattr(config, "rms_norm", False)
word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
# For GPT-J, GPT-NeoX
self.parallel_block = getattr(config, "parallel_block", False)
if process_group is None:
self.embeddings = GPT2Embeddings(
config.hidden_size,
vocab_size,
config.max_position_embeddings,
word_embed_proj_dim=word_embed_proj_dim,
**factory_kwargs,
)
else:
self.embeddings = ParallelGPT2Embeddings(
config.hidden_size,
vocab_size,
config.max_position_embeddings,
process_group=process_group,
sequence_parallel=self.sequence_parallel,
**factory_kwargs,
)
# We change the order of dropout, residual and layer norm:
# Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
# Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
# the main branch (output of MLP). The model definition is unchanged, but the mapping of the
        # nn.Dropout probabilities is changed.
# This is for performance reason: we can fuse dropout + add + layer_norm.
self.layers = nn.ModuleList(
[
create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
for i in range(config.num_hidden_layers)
]
)
rotary_emb_fraction = getattr(config, "rotary_emb_fraction", 0.0)
if rotary_emb_fraction > 0.0: # Tie all the RotaryEmbedding modules to share the same cos/sin cache
for layer in self.layers[1:]:
layer.mixer.rotary_emb = self.layers[0].mixer.rotary_emb
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln:
if layer_norm_fn is None:
raise ImportError("Triton is not installed")
if self.prenorm:
self.drop_f = nn.Dropout(config.resid_pdrop)
norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm
self.ln_f = norm_cls(
config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs
)
if process_group is not None:
for p in self.ln_f.parameters():
# Mark the norm parameters as "shared_params" so that we sync their values at init.
p._shared_params = True
# Mark the norm params as "sequence_parallel" so we run all-reduce on their grads.
if self.sequence_parallel:
p._sequence_parallel = True
self.apply(
partial(
_init_weights,
n_layer=config.num_hidden_layers,
initializer_range=config.initializer_range,
mup_width_scale=getattr(config, "mup_width_scale", 1.0),
)
)
self.tie_weights()
def tie_weights(self):
if self.process_group is not None:
sync_shared_params(self, self.process_group)
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
return {
i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
for i, layer in enumerate(self.layers)
}
def forward(self, input_ids, position_ids=None, inference_params=None):
# If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen
# dimensions so that we can split on it easily, in case of small batch size.
# Only the attention layers need to know the seqlen.
embedding_kwargs = (
{"combine_batch_seqlen_dim": True}
if self.process_group is not None and self.sequence_parallel
else {}
)
hidden_states = self.embeddings(input_ids, position_ids=position_ids, **embedding_kwargs)
if self.embeddings_multiplier != 1.0:
hidden_states = hidden_states * self.embeddings_multiplier
if self.parallel_block:
hidden_states2 = None
residual = None
mixer_kwargs = (
{"seqlen": input_ids.shape[1]}
if self.process_group is not None and self.sequence_parallel
else {}
)
if inference_params is not None:
mixer_kwargs["inference_params"] = inference_params
for layer in self.layers:
if self.prenorm:
if not self.parallel_block:
hidden_states, residual = layer(
hidden_states, residual, mixer_kwargs=mixer_kwargs
)
else:
hidden_states, hidden_states2, residual = layer(
hidden_states, hidden_states2, residual, mixer_kwargs=mixer_kwargs
)
else:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if self.prenorm:
if not self.fused_dropout_add_ln:
dropped = self.drop_f(hidden_states)
if not self.parallel_block:
residual = (dropped + residual) if residual is not None else dropped
else:
dropped2 = self.drop_f(hidden_states2)
residual = (
(residual + dropped + dropped2)
if residual is not None
else dropped + dropped2
)
hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype))
else:
# Set prenorm=False here since we don't need the residual
hidden_states = layer_norm_fn(
hidden_states,
self.ln_f.weight,
self.ln_f.bias,
residual=residual,
x1=None if not self.parallel_block else hidden_states2,
eps=self.ln_f.eps,
dropout_p=self.drop_f.p if self.training else 0.0,
prenorm=False,
is_rms_norm=isinstance(self.ln_f, RMSNorm)
)
return hidden_states
class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin):
def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__(config)
self.process_group = process_group
self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs)
self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
lm_head_bias = getattr(config, "lm_head_bias", False)
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
# This option is for OPT-350m
word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
embed_dim = config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim
if word_embed_proj_dim is not None:
self.project_out = nn.Linear(config.n_embd, embed_dim, bias=False, **factory_kwargs)
else:
self.project_out = None
mup_width_scale = getattr(config, "mup_width_scale", 1.0)
mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0)
self.output_scale = mup_output_multiplier * mup_width_scale
if process_group is None:
self.lm_head = nn.Linear(embed_dim, vocab_size, bias=lm_head_bias, **factory_kwargs)
else:
if ColumnParallelLinear is None:
raise ImportError("fused_dense_lib is not installed")
self.lm_head = ColumnParallelLinear(
embed_dim,
vocab_size,
process_group,
bias=lm_head_bias,
sequence_parallel=getattr(config, "sequence_parallel", True),
**factory_kwargs,
)
self.norm_head = getattr(config, "norm_head", False)
# Initialize weights and apply final processing
self.apply(
partial(
_init_weights,
n_layer=config.num_hidden_layers,
initializer_range=config.initializer_range,
mup_width_scale=mup_width_scale,
)
)
self.tie_weights()
def tie_weights(self):
if self.tie_word_embeddings:
self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight
if self.process_group is not None:
sync_shared_params(self, self.process_group)
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
return self.transformer.allocate_inference_cache(
batch_size, max_seqlen, dtype=dtype, **kwargs
)
def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
"""
input_ids: (batch, seqlen) int tensor
inference_params: for generation. Adapted from Megatron-LM (and Apex)
https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
num_last_tokens: if > 0, only return the logits for the last n tokens
"""
assert (
input_ids.ndim == 2
), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
b, slen = input_ids.shape
hidden_states = self.transformer(
input_ids, position_ids=position_ids, inference_params=inference_params
)
if inference_params is not None:
assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
if num_last_tokens > 0:
hidden_states = hidden_states[:, -num_last_tokens:]
if self.project_out is not None:
hidden_states = self.project_out(hidden_states)
if self.output_scale != 1.0:
hidden_states = hidden_states * self.output_scale
if not self.norm_head:
lm_logits = self.lm_head(hidden_states)
else:
lm_head_weight = F.normalize(self.lm_head.weight)
if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
hidden_states = all_gather(hidden_states, self.lm_head.process_group)
lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
# During inference, we want the full logit for sampling
if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
return CausalLMOutput(logits=lm_logits)
def load_state_dict(self, state_dict, strict=True):
# Remapping from our checkpoints that used a different ordering of layers in the block
# Previous: Attn / MLP -> Dropout -> Add -> LN
# Current: Dropout -> Add -> LN -> Attn / MLP
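        # Concretely, each norm shifts one slot earlier: old ln_0 -> layers.0.norm1,
        # old layers.{l}.norm1 -> layers.{l}.norm2, old layers.{l}.norm2 -> layers.{l+1}.norm1,
        # and the last layer's old norm2 -> ln_f.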
if "transformer.ln_0.weight" in state_dict:
n_layers = len(self.transformer.layers)
ln_weight = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.weight")
ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias")
state_dict["transformer.ln_f.weight"] = ln_weight
state_dict["transformer.ln_f.bias"] = ln_bias
for l in reversed(range(n_layers)):
ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight")
ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias")
state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight
state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias
if l > 0:
ln_weight = state_dict.pop(f"transformer.layers.{l - 1}.norm2.weight")
ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias")
state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight
state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias
ln_weight = state_dict.pop("transformer.ln_0.weight")
ln_bias = state_dict.pop("transformer.ln_0.bias")
state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight
state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias
return super().load_state_dict(state_dict, strict=strict)
def shard_state_dict_tp(state_dict, config, world_size, rank):
"""Convert the state_dict of a standard GPT model to the state_dict of a GPT model
with tensor parallel.
This function modifies state_dict in place.
"""
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
assert vocab_size % world_size == 0
assert config.hidden_size % world_size == 0
inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
assert inner_dim % world_size == 0
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", n_head)
embed_dim = config.hidden_size
head_dim = embed_dim // n_head
def shard_first_dim(state_dict, key):
if key in state_dict:
x = state_dict[key]
dim = x.shape[0] // world_size
state_dict[key] = x[rank * dim : (rank + 1) * dim]
def shard_last_dim(state_dict, key, multiple_of=1):
if key in state_dict:
x = state_dict[key]
dim_each_rank = [
get_dim_for_local_rank(x.size(-1), world_size, local_rank, multiple_of)
for local_rank in range(world_size)
]
beg, end = tuple(sum(dim_each_rank[:pos]) for pos in (rank, rank + 1))
state_dict[key] = x[..., beg:end]
def shard_gatedmlp_fc1_dim(state_dict, key):
if key in state_dict:
x = state_dict[key]
dim = x.shape[0] // world_size // 2
state_dict[key] = rearrange(
rearrange(x, "(two o) ... -> two o ...", two=2)[:, rank * dim : (rank + 1) * dim],
"two o ... -> (two o) ...",
)
def shard_qkv_headdim(state_dict, key):
if key in state_dict:
n_head_each_rank = [
get_dim_for_local_rank(n_head, world_size, local_rank)
for local_rank in range(world_size)
]
n_head_kv_each_rank = [
get_dim_for_local_rank(n_head_kv, world_size, local_rank)
for local_rank in range(world_size)
]
beg_n_head = sum(n_head_each_rank[:rank])
end_n_head = sum(n_head_each_rank[: rank + 1])
beg_n_head_kv = sum(n_head_kv_each_rank[:rank])
end_n_head_kv = sum(n_head_kv_each_rank[: rank + 1])
if n_head_kv == n_head:
x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3)
state_dict[key] = rearrange(
x[:, beg_n_head * head_dim : end_n_head * head_dim],
"three d ... -> (three d) ...",
)
else:
x = rearrange(
state_dict[key],
"(nheadqkv headdim) ... -> nheadqkv headdim ...",
nheadqkv=n_head + 2 * n_head_kv,
)
state_dict[key] = rearrange(
torch.cat(
[
x[beg_n_head:end_n_head],
x[n_head + beg_n_head_kv : n_head + end_n_head_kv],
x[
n_head
+ n_head_kv
+ beg_n_head_kv : n_head
+ n_head_kv
+ end_n_head_kv
],
],
dim=0,
),
"nheadqkv headdim ... -> (nheadqkv headdim) ...",
)
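    # Sharding plan: column-parallel weights (word embeddings, lm_head, Wqkv, mlp.fc1) are split
    # along dim 0; position embeddings and row-parallel weights (out_proj, mlp.fc2) are split along
    # the last dim; the biases of the row-parallel layers are kept only on rank 0.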
shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight")
if "lm_head.weight" in state_dict:
shard_first_dim(state_dict, "lm_head.weight")
if "transformer.embeddings.position_embeddings.weight" in state_dict:
shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight")
for i in range(config.num_hidden_layers):
shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
shard_last_dim(
state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", multiple_of=head_dim
)
if rank != 0:
state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias", None)
if config.activation_function in ["glu", "swiglu", "geglu"]:
shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
else:
shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight")
if rank != 0:
state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias", None)
return state_dict
def combine_state_dicts_tp(state_dicts: List[Dict[str, torch.Tensor]], config: GPT2Config):
"""Convert the list of sharded state_dict of a GPT model with tensor parallel to
the state_dict of a standard GPT model.
This function is meant to be the "reverse" of shard_state_dict_tp.
Precondition:
- state_dicts should be ordered in the same way as the shards were created.
"""
world_size = len(state_dicts)
keys = state_dicts[0].keys()
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
assert vocab_size % world_size == 0
assert config.hidden_size % world_size == 0
inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
assert inner_dim % world_size == 0
assert config.hidden_size % config.n_head == 0
headdim = config.hidden_size // config.n_head
    # Sometimes the word embeddings are sharded on the 0th dim, sometimes on the 1st dim;
    # we detect which by checking whether the shard's 0th dim equals vocab_size // world_size.
def combine_word_embeddings(state_dicts, state_dict, key):
dim = 0 if state_dicts[0][key].shape[0] == vocab_size // world_size else 1
state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
def combine_dim(state_dicts, state_dict, key, dim=-1):
if key in state_dict:
state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
def combine_qkv_headdim(state_dicts, state_dict, key):
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", n_head)
if key in state_dict:
if n_head_kv == n_head:
xs = [
rearrange(s[key], "(three d) ... -> three d ...", three=3) for s in state_dicts
]
state_dict[key] = rearrange(torch.cat(xs, dim=1), "three d ... -> (three d) ...")
else:
n_head_each_rank = [
get_dim_for_local_rank(n_head, world_size, local_rank)
for local_rank in range(world_size)
]
n_head_kv_each_rank = [
get_dim_for_local_rank(n_head_kv, world_size, local_rank)
for local_rank in range(world_size)
]
xs = [
rearrange(
s[key],
"(nheadqkv headdim) ... -> nheadqkv headdim ...",
nheadqkv=rank_n_head + 2 * rank_n_head_kv,
headdim=headdim,
)
for s, rank_n_head, rank_n_head_kv in zip(
state_dicts, n_head_each_rank, n_head_kv_each_rank
)
]
wq = torch.cat([x[: n_head_each_rank[rank]] for rank, x in enumerate(xs)], dim=0)
wk = torch.cat(
[
x[
n_head_each_rank[rank] : n_head_each_rank[rank]
+ n_head_kv_each_rank[rank]
]
for rank, x in enumerate(xs)
],
dim=0,
)
wv = torch.cat(
[
x[n_head_each_rank[rank] + n_head_kv_each_rank[rank] :]
for rank, x in enumerate(xs)
],
dim=0,
)
wqkv = torch.cat(
[wq, wk, wv],
dim=0,
)
state_dict[key] = rearrange(
wqkv,
"nheadqkv headdim ... -> (nheadqkv headdim) ...",
)
def combine_gated_mlp(state_dicts, state_dict, key):
if key in state_dict:
xs = [rearrange(s[key], "(two d) ... -> two d ...", two=2) for s in state_dicts]
state_dict[key] = rearrange(torch.cat(xs, dim=1), "two d ... -> (two d) ...")
    state_dict = state_dicts[0].copy()  # don't modify state_dicts[0] in place
combine_word_embeddings(
state_dicts, state_dict, "transformer.embeddings.word_embeddings.weight"
)
if "lm_head.weight" in state_dict:
combine_word_embeddings(state_dicts, state_dict, "lm_head.weight")
if "transformer.embeddings.position_embeddings.weight" in state_dict:
combine_dim(
state_dicts, state_dict, "transformer.embeddings.position_embeddings.weight", -1
)
mlp_combine_fn = (
combine_gated_mlp
if config.activation_function in ["glu", "swiglu", "geglu"]
else partial(combine_dim, dim=0)
)
for i in range(config.num_hidden_layers):
combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", -1)
mlp_combine_fn(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.bias", 0)
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc2.weight", -1)
return state_dict
def remap_state_dict_hf_gpt2(state_dict, config):
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
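    # HF GPT-2 stores c_fc / c_proj (and c_attn below) as Conv1D modules whose weights have shape
    # (in_features, out_features), so they are transposed here to the (out, in) layout of nn.Linear.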
for d in range(config.num_hidden_layers):
W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight")
state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t()
W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight")
state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
def key_mapping_mlp(key):
key = re.sub(r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key)
key = re.sub(r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
state_dict.pop(f"h.{d}.attn.bias") # We don't store this bias
Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight")
state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
def key_mapping_attn(key):
key = re.sub(r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
key = re.sub(
r"^h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def remap_state_dict_megatron(state_dict, config):
def key_mapping_transformer(key):
key = re.sub(r"^language_model.encoder.", "transformer.", key)
key = re.sub(r"^language_model.", "transformer.", key)
return key
state_dict = OrderedDict((key_mapping_transformer(k), v) for k, v in state_dict.items())
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embedding.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.final_layernorm.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.(weight|bias)",
r"transformer.layers.\1.norm1.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)",
r"transformer.layers.\1.norm2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)",
r"transformer.layers.\1.mlp.fc1.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)",
r"transformer.layers.\1.mlp.fc2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attention.rotary_emb.inv_freq",
r"transformer.layers.\1.mixer.rotary_emb.inv_freq",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)",
r"transformer.layers.\1.mixer.Wqkv.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.dense.(weight|bias)",
r"transformer.layers.\1.mixer.out_proj.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
# Megatron stores Wqkv as ((nheads 3 headdim), hidden_dim)
# while we store Wqkv as ((3 nheads headdim), hidden_dim)
headdim = config.hidden_size // config.num_attention_heads
for d in range(config.num_hidden_layers):
Wqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = rearrange(
Wqkv,
"(nheads three headdim) ... -> (three nheads headdim) ...",
three=3,
headdim=headdim,
)
bqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = rearrange(
bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
)
return state_dict
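# --- Added illustration (hedged sketch, not part of the original file) ---
# The loop above permutes rows of Wqkv from Megatron's per-head [q_h, k_h, v_h] packing
# (nheads x 3 x headdim along dim 0) to the [all Q, all K, all V] packing used here
# (3 x nheads x headdim). A tiny self-check of that row permutation:
def _check_wqkv_layout_permutation(nheads=2, headdim=3, hidden=5):
    import torch
    from einops import rearrange
    w = torch.arange(nheads * 3 * headdim * hidden, dtype=torch.float32).reshape(
        nheads * 3 * headdim, hidden
    )
    w_perm = rearrange(
        w, "(nheads three headdim) ... -> (three nheads headdim) ...", three=3, headdim=headdim
    )
    # First Q row of head 1: row headdim in the target layout, row 3 * headdim in Megatron's.
    assert torch.equal(w_perm[headdim], w[3 * headdim])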
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, GPTNeoXConfig
def remap_state_dict_hf_gpt_neox(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^gpt_neox.", "transformer.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings", False):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("embed_out.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for l in range(config.n_layer):
# We don't store these biases
state_dict.pop(f"transformer.layers.{l}.attention.bias")
state_dict.pop(f"transformer.layers.{l}.attention.masked_bias")
# We don't store these
state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None)
# GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim)
# while we store Wqkv as ((3 nheads headdim), hidden_dim)
headdim = config.hidden_size // config.num_attention_heads
Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange(
Wqkv,
"(nheads three headdim) ... -> (three nheads headdim) ...",
three=3,
headdim=headdim,
)
bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange(
bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
)
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).attention.dense.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config:
assert gpt_neox_config.rotary_emb_base == 10000
return GPT2Config(
vocab_size=gpt_neox_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=gpt_neox_config.hidden_size,
n_layer=gpt_neox_config.num_hidden_layers,
n_head=gpt_neox_config.num_attention_heads,
n_inner=gpt_neox_config.intermediate_size,
activation_function=gpt_neox_config.hidden_act,
resid_pdrop=0.0, # No dropout
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=gpt_neox_config.layer_norm_eps,
initializer_range=gpt_neox_config.initializer_range,
bos_token_id=gpt_neox_config.bos_token_id,
eos_token_id=gpt_neox_config.eos_token_id,
# These are new arguments not in the original GPT2Config
prenorm=True,
parallel_block=gpt_neox_config.use_parallel_residual,
parallel_block_tied_norm=False,
rotary_emb_fraction=gpt_neox_config.rotary_pct,
tie_word_embeddings=gpt_neox_config.tie_word_embeddings,
)
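# --- Added usage sketch (hedged; the checkpoint name below is an assumption) ---
# Typical flow: convert the Hugging Face GPTNeoXConfig into the GPT2Config used by this
# codebase, then remap the checkpoint's state dict into the naming/layout expected here.
# Exact buffer names (e.g. attention.bias) can vary across transformers versions.
def _example_load_gpt_neox(model_name="EleutherAI/pythia-160m"):
    from transformers import GPTNeoXForCausalLM
    hf_model = GPTNeoXForCausalLM.from_pretrained(model_name)
    config = gpt_neox_config_to_gpt2_config(hf_model.config)
    state_dict = remap_state_dict_hf_gpt_neox(hf_model.state_dict(), config)
    return config, state_dict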
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTJConfig
def remap_state_dict_hf_gptj(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^transformer.h.", "transformer.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
output_embeddings_bias = state_dict.pop("lm_head.bias")
state_dict["lm_head.bias"] = F.pad(
output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key)
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for l in range(config.n_layer):
Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight")
Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight")
Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
# We don't store these biases
state_dict.pop(f"transformer.layers.{l}.attn.bias")
state_dict.pop(f"transformer.layers.{l}.attn.masked_bias")
def key_mapping_attn(key):
return re.sub(
r"^transformer.layers.(\d+).attn.out_proj.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config:
headdim = gptj_config.n_embd // gptj_config.n_head
return GPT2Config(
vocab_size=gptj_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=gptj_config.n_embd,
n_layer=gptj_config.n_layer,
n_head=gptj_config.n_head,
n_inner=gptj_config.n_inner,
activation_function=gptj_config.activation_function,
resid_pdrop=gptj_config.resid_pdrop,
embd_pdrop=gptj_config.embd_pdrop,
attn_pdrop=gptj_config.attn_pdrop,
layer_norm_epsilon=gptj_config.layer_norm_epsilon,
initializer_range=gptj_config.initializer_range,
bos_token_id=gptj_config.bos_token_id,
eos_token_id=gptj_config.eos_token_id,
# These are new arguments not in the original GPT2Config
prenorm=True,
parallel_block=True,
parallel_block_tied_norm=True,
rotary_emb_fraction=gptj_config.rotary_dim / headdim,
rotary_emb_interleaved=True,
tie_word_embeddings=False,
qkv_proj_bias=False,
out_proj_bias=False,
lm_head_bias=True,
)
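# --- Added usage sketch (hedged; the checkpoint name below is an assumption) ---
# GPT-J keeps separate q/k/v projections and an untied, biased lm_head; the remap above
# concatenates q/k/v into a single Wqkv, and the config conversion records GPT-J's
# interleaved rotary embedding over rotary_dim of each head. Buffer names such as
# attn.bias / attn.masked_bias depend on the transformers version.
def _example_load_gptj(model_name="EleutherAI/gpt-j-6B"):
    from transformers import GPTJForCausalLM
    hf_model = GPTJForCausalLM.from_pretrained(model_name)
    config = gptj_config_to_gpt2_config(hf_model.config)
    state_dict = remap_state_dict_hf_gptj(hf_model.state_dict(), config)
    return config, state_dict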