Remove unnecessary files

ae856f3a · Woosuk Kwon · 6ac8e63a · 6ac8e63a · 6ac8e63a · 6ac8e63a
Commit ae856f3a authored Mar 28, 2024 by Woosuk Kwon
20 changed files
--- a/vllm_flash_attn/bert_padding.py
+++ b/vllm_flash_attn/bert_padding.py
-# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
-
-import torch
-import torch.nn.functional as F
-from einops import rearrange, repeat
-
-
-class IndexFirstAxis(torch.autograd.Function):
-    """Autograd function that selects rows of a tensor along its first axis.
-
-    The forward pass produces the same values as ``input[indices]``; the backward
-    pass scatters the incoming gradient into a zero tensor with the input's shape.
-    """
-
-    @staticmethod
-    def forward(ctx, input, indices):
-        """Return the rows of ``input`` (which must have ndim >= 2) picked by ``indices``."""
-        assert input.ndim >= 2
-        ctx.save_for_backward(indices)
-        ctx.first_axis_dim = input.shape[0]
-        trailing_shape = input.shape[1:]
-        flat_width = trailing_shape.numel()
-        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing,
-        # so gather on a 2-D view instead of writing `input[indices]`.
-        flat_input = rearrange(input, "b ... -> b (...)")
-        gather_index = repeat(indices, "z -> z d", d=flat_width)
-        picked = torch.gather(flat_input, 0, gather_index)
-        return picked.reshape(-1, *trailing_shape)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """Scatter ``grad_output`` rows back to the positions they were read from."""
-        (indices,) = ctx.saved_tensors
-        assert grad_output.ndim >= 2
-        trailing_shape = grad_output.shape[1:]
-        flat_grad = rearrange(grad_output, "b ... -> b (...)")
-        flat_width = flat_grad.shape[1]
-        grad_input = torch.zeros(
-            [ctx.first_axis_dim, flat_width],
-            device=flat_grad.device,
-            dtype=flat_grad.dtype,
-        )
-        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing,
-        # so scatter instead of writing `grad_input[indices] = grad_output`.
-        scatter_index = repeat(indices, "z -> z d", d=flat_width)
-        grad_input.scatter_(0, scatter_index, flat_grad)
-        # `indices` receives no gradient, hence the trailing None.
-        return grad_input.reshape(ctx.first_axis_dim, *trailing_shape), None
-
-
-index_first_axis = IndexFirstAxis.apply
-
-
-class IndexPutFirstAxis(torch.autograd.Function):
-    """Autograd function that writes rows into a zero tensor along its first axis.
-
-    The forward pass builds a zero tensor with ``first_axis_dim`` rows and assigns
-    ``values`` at ``indices``; the backward pass reads the gradient at those rows.
-    """
-
-    @staticmethod
-    def forward(ctx, values, indices, first_axis_dim):
-        """Return a ``(first_axis_dim, *values.shape[1:])`` tensor holding ``values`` at ``indices``."""
-        ctx.save_for_backward(indices)
-        assert indices.ndim == 1
-        assert values.ndim >= 2
-        padded_shape = (first_axis_dim, *values.shape[1:])
-        padded = torch.zeros(padded_shape, device=values.device, dtype=values.dtype)
-        # Plain index assignment is what runs here. A scatter_-based variant was
-        # noted as slightly faster (TD [2022-03-04]) but is not used.
-        padded[indices] = values
-        return padded
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """Return the gradient rows at ``indices``; the other two inputs get no gradient."""
-        (indices,) = ctx.saved_tensors
-        # Plain indexing is what runs here. A gather-based variant was noted as
-        # slightly faster (TD [2022-03-04]) but is not used.
-        return grad_output[indices], None, None
-
-
-index_put_first_axis = IndexPutFirstAxis.apply
-
-
-class IndexFirstAxisResidual(torch.autograd.Function):
-    """Row selection along the first axis that also returns the (detached) input.
-
-    forward returns ``(input[indices], input.detach())``. backward receives one
-    gradient per output and accumulates the gathered-row gradient into the
-    residual gradient at ``indices``.
-    """
-
-    @staticmethod
-    def forward(ctx, input, indices):
-        """Return ``(input[indices], input.detach())`` for ``input`` with ndim >= 2."""
-        ctx.save_for_backward(indices)
-        assert input.ndim >= 2
-        ctx.first_axis_dim = input.shape[0]
-        # Plain indexing is used on purpose: we don't want to reshape input
-        # (b ... -> b (...)) since it could change the channel_last memory format
-        # to channel_first. In other words, input might not be contiguous.
-        output = input[indices]
-        # If we don't detach, Pytorch complains about output being a view and is being modified inplace
-        return output, input.detach()
-
-    @staticmethod
-    def backward(ctx, grad_output, grad_residual):
-        """Return ``grad_residual`` with ``grad_output`` added at ``indices`` (and None for indices)."""
-        (indices,) = ctx.saved_tensors
-        assert grad_output.ndim >= 2
-        other_shape = grad_output.shape[1:]
-        assert grad_residual.shape[1:] == other_shape
-        # grad_input aliases grad_residual, so the scatter_add_ below updates
-        # grad_residual in place rather than allocating a new tensor.
-        grad_input = grad_residual
-        # Equivalent to `grad_input[indices] += grad_output`: broadcast the 1-D
-        # indices to grad_output's shape so scatter_add_ can consume them.
-        indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
-        indices = indices.expand_as(grad_output)
-        grad_input.scatter_add_(0, indices, grad_output)
-        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
-
-
-index_first_axis_residual = IndexFirstAxisResidual.apply
-
-
-def unpad_input(hidden_states, attention_mask):
-    """Drop the masked-out positions of a padded batch and describe the packed result.
-
-    Arguments:
-        hidden_states: (batch, seqlen, ...)
-        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
-    Return:
-        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
-        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
-        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
-        max_seqlen_in_batch: int
-    """
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    # Prepend a 0 so that cu_seqlens[i]:cu_seqlens[i + 1] delimits sequence i.
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
-    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
-    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
-    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
-    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
-    # so we write custom forward and backward to make it a bit faster.
-    return (
-        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
-    """
-    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model).
-    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
-
-    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
-        ```
-        [
-          [2, 3, 0, 0, 0, 0],
-          [3, 2, 0, 0, 0, 0],
-          [6, 0, 0, 0, 0, 0]
-        ]
-        ```
-    , which refers to the 3D-attention mask:
-        ```
-        [
-          [
-            [1, 0, 0, 0, 0, 0],
-            [1, 1, 0, 0, 0, 0],
-            [0, 0, 1, 0, 0, 0],
-            [0, 0, 1, 1, 0, 0],
-            [0, 0, 1, 1, 1, 0],
-            [0, 0, 0, 0, 0, 1]
-          ],
-          [
-            [1, 0, 0, 0, 0, 0],
-            [1, 1, 0, 0, 0, 0],
-            [1, 1, 1, 0, 0, 0],
-            [0, 0, 0, 1, 0, 0],
-            [0, 0, 0, 1, 1, 0],
-            [0, 0, 0, 0, 0, 1]
-          ],
-          [
-            [1, 0, 0, 0, 0, 0],
-            [1, 1, 0, 0, 0, 0],
-            [1, 1, 1, 0, 0, 0],
-            [1, 1, 1, 1, 0, 0],
-            [1, 1, 1, 1, 1, 0],
-            [1, 1, 1, 1, 1, 1]
-          ]
-        ]
-        ```.
-
-    Arguments:
-        hidden_states: (batch, seqlen, ...)
-        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
-    Return:
-        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected by attention_mask_in_length.
-        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
-        cu_seqlens: (num_sequences + 1), where num_sequences = number of nonzero entries in attention_mask_in_length; the cumulative sequence lengths, used to index into hidden_states.
-        max_seqlen_in_batch: int, the largest nonzero entry of attention_mask_in_length.
-    """
-    # Total number of valid tokens in each batch row.
-    length = attention_mask_in_length.sum(dim=-1)
-    seqlen = attention_mask_in_length.size(-1)
-    # Position j of row b is valid iff j < length[b].
-    attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
-    # Each nonzero entry of attention_mask_in_length is the length of one sub-sequence.
-    real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
-    seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
-    indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    # Prepend a 0 so that cu_seqlens[i]:cu_seqlens[i + 1] delimits sub-sequence i.
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
-    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
-    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
-    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
-    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
-    # so we write custom forward and backward to make it a bit faster.
-    return (
-        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-def pad_input(hidden_states, indices, batch, seqlen):
-    """Scatter packed tokens back into a zero-filled padded batch.
-
-    Arguments:
-        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
-        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
-        batch: int, batch size for the padded sequence.
-        seqlen: int, maximum sequence length for the padded sequence.
-    Return:
-        hidden_states: (batch, seqlen, ...)
-    """
-    # Write the rows into a zero tensor with batch * seqlen rows, then split the
-    # first axis back into (batch, seqlen).
-    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
-    return rearrange(output, "(b s) ... -> b s ...", b=batch)
--- a/vllm_flash_attn/flash_attn_triton.py
+++ b/vllm_flash_attn/flash_attn_triton.py
--- a/vllm_flash_attn/flash_attn_triton_og.py
+++ b/vllm_flash_attn/flash_attn_triton_og.py
-# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
-# for benchmarking.
-# We fixed a few dtype casts to make it work for bf16
-
-"""
-Fused Attention
-===============
-This is a Triton implementation of the Flash Attention algorithm
-(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
-"""
-
-import pytest
-import torch
-import triton
-import triton.language as tl
-
-
-# Forward fused-attention kernel.
-# Each program handles one BLOCK_M-row block of queries (program_id(0)) for one
-# batch*head slice (program_id(1)). It walks the keys/values in BLOCK_N chunks,
-# keeps a running row max (m_i) and running row sum (l_i), and accumulates the
-# normalized output in `acc`. The final l_i / m_i are written to L / M and the
-# accumulator to Out.
-# NOTE(review): the K and V offsets below are built from stride_qh (and, for V,
-# stride_qm / stride_qk), so this presumably assumes Q, K and V share the same
-# layout - confirm against the caller.
-@triton.jit
-def _fwd_kernel(
-    Q,
-    K,
-    V,
-    sm_scale,
-    TMP,
-    L,
-    M,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
-    Out,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vk,
-    stride_vn,
-    stride_oz,
-    stride_oh,
-    stride_om,
-    stride_on,
-    Z,
-    H,
-    N_CTX,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    start_m = tl.program_id(0)
-    off_hz = tl.program_id(1)
-    # initialize offsets
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_n = tl.arange(0, BLOCK_N)
-    offs_d = tl.arange(0, BLOCK_DMODEL)
-    off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
-    off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
-    off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk
-    # Initialize pointers to Q, K, V
-    q_ptrs = Q + off_q
-    k_ptrs = K + off_k
-    v_ptrs = V + off_v
-    # initialize pointer to m and l
-    t_ptrs = TMP + off_hz * N_CTX + offs_m
-    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-    # load q: it will stay in SRAM throughout
-    q = tl.load(q_ptrs)
-    # loop over k, v and update accumulator
-    # The upper bound (start_m + 1) * BLOCK_M stops at this query block's last row.
-    for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
-        start_n = tl.multiple_of(start_n, BLOCK_N)
-        # -- compute qk ----
-        k = tl.load(k_ptrs + start_n * stride_kn)
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k, trans_b=True)
-        qk *= sm_scale
-        # causal mask: key positions greater than the query position get -inf
-        qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
-        # -- compute m_ij, p, l_ij
-        m_ij = tl.max(qk, 1)
-        p = tl.exp(qk - m_ij[:, None])
-        l_ij = tl.sum(p, 1)
-        # -- update m_i and l_i
-        m_i_new = tl.maximum(m_i, m_ij)
-        alpha = tl.exp(m_i - m_i_new)
-        beta = tl.exp(m_ij - m_i_new)
-        l_i_new = alpha * l_i + beta * l_ij
-        # -- update output accumulator --
-        # scale p
-        p_scale = beta / l_i_new
-        p = p * p_scale[:, None]
-        # scale acc
-        acc_scale = l_i / l_i_new * alpha
-        # The round trip through TMP is a deliberate workaround; do not remove it.
-        tl.store(t_ptrs, acc_scale)
-        acc_scale = tl.load(t_ptrs)  # BUG: have to store and immediately load
-        acc = acc * acc_scale[:, None]
-        # update acc
-        v = tl.load(v_ptrs + start_n * stride_vk)
-        p = p.to(v.dtype)
-        acc += tl.dot(p, v)
-        # update m_i and l_i
-        l_i = l_i_new
-        m_i = m_i_new
-    # rematerialize offsets to save registers
-    start_m = tl.program_id(0)
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    # write back l and m
-    l_ptrs = L + off_hz * N_CTX + offs_m
-    m_ptrs = M + off_hz * N_CTX + offs_m
-    tl.store(l_ptrs, l_i)
-    tl.store(m_ptrs, m_i)
-    # initialize pointers to output
-    offs_n = tl.arange(0, BLOCK_DMODEL)
-    off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
-    out_ptrs = Out + off_o
-    tl.store(out_ptrs, acc)
-
-
-# Backward pre-processing kernel.
-# For one block of BLOCK_M rows it loads Out and DO (D_HEAD columns per row) and
-# the per-row value from L, divides DO by that value, and writes:
-#   NewDO: the rescaled DO
-#   Delta: the row-wise sum of Out * rescaled DO
-# Rows are addressed as row * D_HEAD + column, i.e. densely packed rows of
-# D_HEAD elements.
-@triton.jit
-def _bwd_preprocess(
-    Out,
-    DO,
-    L,
-    NewDO,
-    Delta,
-    BLOCK_M: tl.constexpr,
-    D_HEAD: tl.constexpr,
-):
-    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_n = tl.arange(0, D_HEAD)
-    # load
-    o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
-    do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
-    denom = tl.load(L + off_m).to(tl.float32)
-    # compute
-    do = do / denom[:, None]
-    delta = tl.sum(o * do, axis=1)
-    # write-back
-    tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
-    tl.store(Delta + off_m, delta)
-
-
-# Backward fused-attention kernel.
-# One program per batch*head slice (program_id(0)). For every key/value block it
-# recomputes the attention probabilities from Q, K and the saved row maxima M,
-# accumulates dV and dK in registers, and read-modify-writes dQ in memory.
-# `DO` is expected to be the pre-scaled gradient and `D` the per-row delta (see
-# the NOTE inside the loop).
-# NOTE(review): every tensor pointer is advanced with Q's batch/head strides, and
-# the V / DV / DO / DQ offsets use stride_qm / stride_qk, so this presumably
-# assumes all tensors share Q's layout - confirm against the caller.
-@triton.jit
-def _bwd_kernel(
-    Q,
-    K,
-    V,
-    sm_scale,
-    Out,
-    DO,
-    DQ,
-    DK,
-    DV,
-    L,
-    M,
-    D,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vk,
-    stride_vn,
-    Z,
-    H,
-    N_CTX,
-    num_block,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    off_hz = tl.program_id(0)
-    # split the flat program index into batch index and head index
-    off_z = off_hz // H
-    off_h = off_hz % H
-    # offset pointers for batch/head
-    Q += off_z * stride_qz + off_h * stride_qh
-    K += off_z * stride_qz + off_h * stride_qh
-    V += off_z * stride_qz + off_h * stride_qh
-    DO += off_z * stride_qz + off_h * stride_qh
-    DQ += off_z * stride_qz + off_h * stride_qh
-    DK += off_z * stride_qz + off_h * stride_qh
-    DV += off_z * stride_qz + off_h * stride_qh
-    for start_n in range(0, num_block):
-        lo = start_n * BLOCK_M
-        # initialize row/col offsets
-        offs_qm = lo + tl.arange(0, BLOCK_M)
-        offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
-        offs_m = tl.arange(0, BLOCK_N)
-        offs_k = tl.arange(0, BLOCK_DMODEL)
-        # initialize pointers to value-like data
-        q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
-        v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        # pointer to row-wise quantities in value-like data
-        D_ptrs = D + off_hz * N_CTX
-        m_ptrs = M + off_hz * N_CTX
-        # initialize dv and dk
-        dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-        dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-        # k and v stay in SRAM throughout
-        k = tl.load(k_ptrs)
-        v = tl.load(v_ptrs)
-        # loop over rows
-        # starts at `lo`: query rows before this key block are masked out below
-        for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
-            offs_m_curr = start_m + offs_m
-            # load q, k, v, do on-chip
-            q = tl.load(q_ptrs)
-            # recompute p = softmax(qk, dim=-1).T
-            # NOTE: `do` is pre-divided by `l`; no normalization here
-            qk = tl.dot(q, k, trans_b=True)
-            qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
-            m = tl.load(m_ptrs + offs_m_curr)
-            p = tl.exp(qk * sm_scale - m[:, None])
-            # compute dv
-            do = tl.load(do_ptrs)
-            dv += tl.dot(p.to(do.dtype), do, trans_a=True)
-            # compute dp = dot(v, do)
-            Di = tl.load(D_ptrs + offs_m_curr)
-            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
-            dp += tl.dot(do, v, trans_b=True)
-            # compute ds = p * (dp - delta[:, None])
-            # (delta was already subtracted when dp was initialised with -Di)
-            ds = p * dp * sm_scale
-            # compute dk = dot(ds.T, q)
-            dk += tl.dot(ds.to(q.dtype), q, trans_a=True)
-            # # compute dq
-            dq = tl.load(dq_ptrs, eviction_policy="evict_last")
-            dq += tl.dot(ds.to(k.dtype), k)
-            tl.store(dq_ptrs, dq, eviction_policy="evict_last")
-            # # increment pointers
-            dq_ptrs += BLOCK_M * stride_qm
-            q_ptrs += BLOCK_M * stride_qm
-            do_ptrs += BLOCK_M * stride_qm
-        # write-back
-        dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
-        tl.store(dv_ptrs, dv)
-        tl.store(dk_ptrs, dk)
-
-
-class _attention(torch.autograd.Function):
-    """Autograd wrapper that launches the Triton attention kernels defined in this file."""
-
-    @staticmethod
-    def forward(ctx, q, k, v, sm_scale):
-        """Run ``_fwd_kernel`` on q, k, v and stash what ``backward`` needs.
-
-        The last dimension of q, k and v must be equal and one of 16, 32, 64, 128.
-        Returns a tensor shaped like ``q``.
-        """
-        block_size = 128
-        # shape constraints
-        head_dim_q, head_dim_k, head_dim_v = q.shape[-1], k.shape[-1], v.shape[-1]
-        assert head_dim_q == head_dim_k and head_dim_k == head_dim_v
-        assert head_dim_k in {16, 32, 64, 128}
-        out = torch.empty_like(q)
-        batch_heads = q.shape[0] * q.shape[1]
-        seq_len = q.shape[2]
-        launch_grid = (triton.cdiv(seq_len, block_size), batch_heads)
-        # One float32 entry per (batch*head, row): kernel scratch space, row sums, row maxima.
-        per_row_shape = (batch_heads, seq_len)
-        scratch = torch.empty(per_row_shape, device=q.device, dtype=torch.float32)
-        row_sum = torch.empty(per_row_shape, device=q.device, dtype=torch.float32)
-        row_max = torch.empty(per_row_shape, device=q.device, dtype=torch.float32)
-        warps = 4 if head_dim_k <= 64 else 8
-
-        # Arguments are positional and must follow the kernel's parameter order.
-        _fwd_kernel[launch_grid](
-            q,
-            k,
-            v,
-            sm_scale,
-            scratch,
-            row_sum,
-            row_max,
-            out,
-            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
-            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
-            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
-            out.stride(0), out.stride(1), out.stride(2), out.stride(3),
-            q.shape[0],
-            q.shape[1],
-            seq_len,
-            BLOCK_M=block_size,
-            BLOCK_N=block_size,
-            BLOCK_DMODEL=head_dim_k,
-            num_warps=warps,
-            num_stages=1,
-        )
-        ctx.save_for_backward(q, k, v, out, row_sum, row_max)
-        ctx.BLOCK = block_size
-        ctx.grid = launch_grid
-        ctx.sm_scale = sm_scale
-        ctx.BLOCK_DMODEL = head_dim_k
-        return out
-
-    @staticmethod
-    def backward(ctx, do):
-        """Run ``_bwd_preprocess`` then ``_bwd_kernel``; return (dq, dk, dv, None)."""
-        q, k, v, out, row_sum, row_max = ctx.saved_tensors
-        do = do.contiguous()
-        # dq is accumulated in float32 and cast back to q's dtype on return.
-        grad_q = torch.zeros_like(q, dtype=torch.float32)
-        grad_k = torch.empty_like(k)
-        grad_v = torch.empty_like(v)
-        scaled_do = torch.empty_like(do)
-        row_delta = torch.empty_like(row_sum)
-        n_row_blocks, n_batch_heads = ctx.grid[0], ctx.grid[1]
-        _bwd_preprocess[(n_row_blocks * n_batch_heads,)](
-            out,
-            do,
-            row_sum,
-            scaled_do,
-            row_delta,
-            BLOCK_M=ctx.BLOCK,
-            D_HEAD=ctx.BLOCK_DMODEL,
-        )
-
-        # NOTE: kernel currently buggy for other values of `num_warps`
-        warps = 8
-        _bwd_kernel[(n_batch_heads,)](
-            q,
-            k,
-            v,
-            ctx.sm_scale,
-            out,
-            scaled_do,
-            grad_q,
-            grad_k,
-            grad_v,
-            row_sum,
-            row_max,
-            row_delta,
-            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
-            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
-            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
-            q.shape[0],
-            q.shape[1],
-            q.shape[2],
-            n_row_blocks,
-            BLOCK_M=ctx.BLOCK,
-            BLOCK_N=ctx.BLOCK,
-            BLOCK_DMODEL=ctx.BLOCK_DMODEL,
-            num_warps=warps,
-            num_stages=1,
-        )
-        # sm_scale is not a tensor input, so it gets no gradient.
-        return grad_q.to(q.dtype), grad_k, grad_v, None
-
-
-attention = _attention.apply
--- a/vllm_flash_attn/flash_blocksparse_attention.py
+++ b/vllm_flash_attn/flash_blocksparse_attention.py
-import math
-
-import hydra
-import torch
-import torch.nn as nn
-from einops import rearrange
-
-from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
-from flash_attn.flash_blocksparse_attn_interface import (
-    convert_blockmask,
-    flash_blocksparse_attn_func,
-)
-
-
-class FlashBlocksparseAttention(nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        sparsity_config: configuration passed to hydra.utils.instantiate; the
-                         resulting object must provide make_layout(seq_length)
-        softmax_temp: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-        max_seq_length: longest supported sequence; rounded up to a multiple
-                        of 256 before the layout is built (default: 2048)
-        device, dtype: accepted for signature compatibility; not used here
-    """
-
-    def __init__(
-        self,
-        sparsity_config,
-        softmax_temp=None,
-        attention_dropout=0.0,
-        max_seq_length=2048,
-        device=None,
-        dtype=None,
-    ):
-        super().__init__()
-        self.sparsity_config = hydra.utils.instantiate(sparsity_config)
-        self.softmax_temp = softmax_temp
-        self.dropout_p = attention_dropout
-
-        # initialize sparse layout and register as buffer
-        max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256
-        layout = self.sparsity_config.make_layout(max_seq_length)
-        self.register_buffer("layout", layout)
-        # Pre-converted full-size mask, used by forward when convert_mask=False.
-        blockmask_converted = convert_blockmask(self.layout, causal=False)
-        self.register_buffer("blockmask_converted", blockmask_converted)
-
-    def _select_blockmask(self, seqlen):
-        """Return the top-left part of ``self.layout`` covering ``seqlen`` rounded up to a multiple of 256."""
-        seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
-        # Both bounds must be inside the assert expression: writing them as
-        # `assert a, (b)` would turn the second check into the assertion message
-        # and it would never be evaluated as a condition.
-        assert (
-            seqlen_rounded // 16 <= self.layout.shape[0]
-            and seqlen_rounded // 256 <= self.layout.shape[1]
-        )
-        return self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
-
-    def forward(
-        self,
-        qkv,
-        attn_mask=None,
-        key_padding_mask=None,
-        causal=False,
-        cu_seqlens=None,
-        max_s=None,
-        need_weights=False,
-        convert_mask=True,
-    ):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
-            attn_mask: An implementation of BaseMask that encodes where each
-                       query can attend to (must be None)
-            key_padding_mask: An implementation of BaseMask that encodes how
-                         many query each sequence in the batch consists of
-            causal: forwarded to flash_blocksparse_attn_func
-            cu_seqlens: cumulative sequence lengths; when given, qkv is passed
-                        to the kernel wrapper as-is and max_s is required
-            max_s: maximum sequence length; required when cu_seqlens is given
-            need_weights: must be False
-            convert_mask: when cu_seqlens is given, False selects the
-                          pre-converted full-size block mask
-        Return
-        ------
-            (output, None)
-        """
-        assert not need_weights
-        assert attn_mask is None
-        assert qkv.dtype == torch.float16
-        assert qkv.is_cuda
-
-        if cu_seqlens is None:
-            batch_size = qkv.shape[0]
-            seqlen = qkv.shape[1]
-            # Convert mask to take a subset
-            blockmask = self._select_blockmask(seqlen)
-            if key_padding_mask is None:
-                # No padding: every sequence has length seqlen, so flatten the
-                # batch and build evenly spaced cumulative lengths.
-                qkv = rearrange(qkv, "b s ... -> (b s) ...")
-                max_s = seqlen
-                cu_seqlens = torch.arange(
-                    0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device
-                )
-                output = flash_blocksparse_attn_func(
-                    qkv,
-                    cu_seqlens,
-                    blockmask,
-                    self.dropout_p if self.training else 0.0,
-                    max_s,
-                    softmax_scale=self.softmax_temp,
-                    causal=causal,
-                )
-                output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
-            else:
-                # Padding present: drop padded tokens, run attention on the packed
-                # tokens, then scatter the result back to (b, s, h, d).
-                key_padding_mask_bool = key_padding_mask.bool_matrix
-                nheads = qkv.shape[-2]
-                x = rearrange(qkv, "b s three h d -> b s (three h d)")
-                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool)
-                x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
-                output_unpad = flash_blocksparse_attn_func(
-                    x_unpad,
-                    cu_seqlens,
-                    blockmask,
-                    self.dropout_p if self.training else 0.0,
-                    max_s,
-                    softmax_scale=self.softmax_temp,
-                    causal=causal,
-                )
-                output = rearrange(
-                    pad_input(
-                        rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
-                    ),
-                    "b s (h d) -> b s h d",
-                    h=nheads,
-                )
-        else:
-            assert max_s is not None
-            seqlen = max_s
-            # Convert mask to take a subset (the bounds check runs even when the
-            # pre-converted mask is used below)
-            blockmask = self._select_blockmask(seqlen)
-            if convert_mask:
-                output = flash_blocksparse_attn_func(
-                    qkv,
-                    cu_seqlens,
-                    blockmask,
-                    self.dropout_p if self.training else 0.0,
-                    max_s,
-                    softmax_scale=self.softmax_temp,
-                    causal=causal,
-                )
-            else:
-                output = flash_blocksparse_attn_func(
-                    qkv,
-                    cu_seqlens,
-                    self.blockmask_converted,
-                    self.dropout_p if self.training else 0.0,
-                    max_s,
-                    softmax_scale=self.softmax_temp,
-                    causal=causal,
-                    convert_mask=False,
-                )
-
-        return output, None
-
-
-class FlashBlocksparseMHA(nn.Module):
-    """Multi-head attention layer: fused QKV projection, block-sparse attention, output projection."""
-
-    def __init__(
-        self,
-        embed_dim,
-        num_heads,
-        sparsity_config,
-        bias=True,
-        batch_first=True,
-        attention_dropout=0.0,
-        causal=False,
-        max_seq_length=2048,
-        device=None,
-        dtype=None,
-        **kwargs,
-    ) -> None:
-        """Build the projections and the inner attention module.
-
-        ``batch_first`` must be true, ``embed_dim`` must be divisible by
-        ``num_heads`` and the resulting head size must be 16, 32 or 64.
-        Extra ``**kwargs`` are accepted and ignored.
-        """
-        assert batch_first
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.causal = causal
-        self.num_heads = num_heads
-
-        assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads"
-        self.head_dim = self.embed_dim // num_heads
-        assert self.head_dim in (16, 32, 64), "Only support head_dim == 16, 32, or 64"
-
-        # device / dtype are forwarded to every submodule created below.
-        factory_kwargs = {"device": device, "dtype": dtype}
-        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
-        self.inner_attn = FlashBlocksparseAttention(
-            sparsity_config,
-            attention_dropout=attention_dropout,
-            max_seq_length=max_seq_length,
-            **factory_kwargs,
-        )
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
-
-    def forward(
-        self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False
-    ):
-        """Apply attention to ``x``; the two ``x_ignored`` arguments and ``attn_mask`` are not used.
-
-        Returns ``(output, attn_weights)`` where ``attn_weights`` is whatever
-        the inner attention module returns as its second value.
-        """
-        packed = self.Wqkv(x)
-        packed = rearrange(packed, "b s (three h d) -> b s three h d", three=3, h=self.num_heads)
-        context, attn_weights = self.inner_attn(
-            packed,
-            key_padding_mask=key_padding_mask,
-            need_weights=need_weights,
-            causal=self.causal,
-        )
-        merged_heads = rearrange(context, "b s h d -> b s (h d)")
-        return self.out_proj(merged_heads), attn_weights
--- a/vllm_flash_attn/flash_blocksparse_attn_interface.py
+++ b/vllm_flash_attn/flash_blocksparse_attn_interface.py
-# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py
-import flash_attn_cuda
-import torch
-import torch.nn as nn
-
-
-def convert_blockmask(blockmask, causal):
-    """Convert from the 0-1 format to the format used by the CUDA code.
-    0 means the block is skipped.
-    nonzero means the block is not skipped.
-    Argument:
-        blockmask: (row, col): a 0-1 tensor
-        causal: must be False (asserted below)
-    Return:
-        blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
-            indices of the nonzero blocks, padded with -1 to reach length @row.
-            The indices are multiplied by 4, with the smallest bit used to encode whether
-            it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is
-            the last nonzero in its row.
-    """
-    assert not causal
-    # TD [2022-05-13]: The indexing and sorting is very tricky
-    nrow, ncol = blockmask.shape
-    # Sort does not support bool on CUDA
-    blockmask = blockmask.to(dtype=torch.uint8)
-    # Per column: stable descending sort moves the nonzero rows to the top while
-    # keeping their original order; nonzero_sorted_rowidx holds the source rows.
-    nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
-    # Inverse permutation: for each (row, col), the position that row took after the sort.
-    nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
-    # Per row: a stable ascending sort puts the last nonzero column at the end.
-    last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
-    last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
-        torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
-    ]
-    # Per row: a stable descending sort puts the first nonzero column at the front.
-    first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
-    first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
-        torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
-    ]
-    # Multiply by 4 to free the two lowest bits for the first/last flags.
-    nonzero_idx = nonzero_sorted_rowidx * 4
-    # +2 sets the "last nonzero in its row" bit, +1 sets the "first nonzero in its row" bit.
-    nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
-    nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
-    # Positions whose sorted value is 0 are padding.
-    nonzero_idx[nonzero_val == 0] = -1
-    return nonzero_idx.T.contiguous().to(dtype=torch.int32)
-
-
-def _flash_blocksparse_attn_forward(
-    qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
-):
-    """Call ``flash_attn_cuda.fwd_block`` and return ``(context, softmax_lse, S_dmask)``.
-
-    ``S_dmask`` is the third value returned by the extension when
-    ``return_softmax`` is true, otherwise None.
-    """
-    # The trailing None is the last positional argument of fwd_block.
-    # NOTE(review): presumably an optional generator - confirm against the extension.
-    results = flash_attn_cuda.fwd_block(
-        qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
-    )
-    context, softmax_lse, *extra = results
-    if return_softmax:
-        S_dmask = extra[0]
-    else:
-        S_dmask = None
-    return context, softmax_lse, S_dmask
-
-
-def _flash_blocksparse_attn_backward(
-    dout,
-    qkv,
-    out,
-    S_dmask,
-    softmax_lse,
-    cu_seqlens,
-    blockmask,
-    dropout_p,
-    max_s,
-    softmax_scale,
-    causal,
-):
-    """Call the block-sparse backward kernel and return only the gradient w.r.t. qkv."""
-    # NOTE: bwd_block is given softmax_scale *before* max_s, which is the reverse of
-    # the order those two parameters have in this function's own signature.
-    kernel_args = (
-        dout,
-        qkv,
-        out,
-        S_dmask,
-        softmax_lse,
-        cu_seqlens,
-        blockmask,
-        dropout_p,
-        softmax_scale,
-        max_s,
-        causal,
-        None,
-    )
-    # The kernel yields three values; only the first one is handed back to the caller.
-    dqkv, _dp, _softmax_d = flash_attn_cuda.bwd_block(*kernel_args)
-    return dqkv
-
-
-class FlashBlocksparseAttnFun(torch.autograd.Function):
-    """Autograd function for block-sparse attention that returns only the output.
-
-    forward takes (qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal);
-    only qkv receives a gradient in backward.
-    """
-
-    @staticmethod
-    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
-        # Save rng_state because the backward pass will regenerate the dropout mask
-        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
-        # Default scaling is 1 / sqrt(size of the last dimension of qkv).
-        if softmax_scale is None:
-            softmax_scale = qkv.shape[-1] ** (-0.5)
-        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
-            qkv,
-            cu_seqlens,
-            blockmask,
-            dropout_p,
-            max_s,
-            softmax_scale,
-            causal=causal,
-            return_softmax=False,
-        )
-        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
-        ctx.dropout_p = dropout_p
-        ctx.max_s = max_s
-        ctx.softmax_scale = softmax_scale
-        ctx.causal = causal
-        return context
-
-    @staticmethod
-    def backward(ctx, dout):
-        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
-        # Replay the RNG state captured in forward, remembering the current one so it
-        # can be put back once the backward kernel has run.
-        if rng_state is not None:
-            cur_rng_state = torch.cuda.get_rng_state()
-            torch.cuda.set_rng_state(rng_state)
-        # S_dmask is None, temporarily use another tensor just to get it running
-        dqkv = _flash_blocksparse_attn_backward(
-            dout,
-            qkv,
-            context,
-            context,
-            softmax_lse,
-            cu_seqlens,
-            blockmask,
-            ctx.dropout_p,
-            ctx.max_s,
-            ctx.softmax_scale,
-            ctx.causal,
-        )
-        if rng_state is not None:
-            torch.cuda.set_rng_state(cur_rng_state)
-        # One gradient per forward input (7 in total); only qkv is differentiable.
-        return dqkv, None, None, None, None, None, None
-
-
-# We duplicate code to return both the output and the softmax for testing
-# Returning both makes backward a bit slower, so we want to keep using the other version for speed.
-class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
-    """Block-sparse attention function that also returns S_dmask and softmax_lse."""
-
-    @staticmethod
-    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
-        # The backward pass regenerates the dropout mask, so remember the CUDA RNG
-        # state whenever dropout is active.
-        rng_state = None
-        if dropout_p > 0:
-            rng_state = torch.cuda.get_rng_state()
-        if softmax_scale is None:
-            softmax_scale = qkv.shape[-1] ** (-0.5)
-        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
-            qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale,
-            causal=causal,
-            return_softmax=True,
-        )
-        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
-        ctx.dropout_p, ctx.max_s = dropout_p, max_s
-        ctx.softmax_scale, ctx.causal = softmax_scale, causal
-        return context, S_dmask, softmax_lse
-
-    @staticmethod
-    def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
-        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
-        restore_rng = rng_state is not None
-        if restore_rng:
-            # Swap in the RNG state captured in forward; keep the current one for later.
-            outer_rng_state = torch.cuda.get_rng_state()
-            torch.cuda.set_rng_state(rng_state)
-        dqkv = _flash_blocksparse_attn_backward(
-            dout, qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask,
-            ctx.dropout_p, ctx.max_s, ctx.softmax_scale, ctx.causal,
-        )
-        if restore_rng:
-            torch.cuda.set_rng_state(outer_rng_state)
-        # Only qkv is differentiable; the remaining six forward inputs get None.
-        return dqkv, None, None, None, None, None, None
-
-
-def flash_blocksparse_attn_func(
-    qkv,
-    cu_seqlens,
-    blockmask,
-    dropout_p,
-    max_s,
-    softmax_scale=None,
-    causal=False,
-    return_attn_probs=False,
-    convert_mask=True,
-):
-    """Apply block-sparse attention through the matching autograd function.
-
-    dropout_p should be set to 0.0 during evaluation.
-    When return_attn_probs is truthy, FlashBlocksparseAttnFunWithS is used instead of
-    FlashBlocksparseAttnFun. When convert_mask is truthy, blockmask is first passed
-    through convert_blockmask.
-    """
-    if return_attn_probs:
-        attn_fn = FlashBlocksparseAttnFunWithS
-    else:
-        attn_fn = FlashBlocksparseAttnFun
-    if convert_mask:
-        blockmask = convert_blockmask(blockmask, causal=causal)
-    return attn_fn.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
--- a/vllm_flash_attn/fused_softmax.py
+++ b/vllm_flash_attn/fused_softmax.py
-# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py
-# for benchmarking.
-# We added support for seqlen=2k and seqlen=4k
-
-# coding=utf-8
-# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-from apex._autocast_utils import _cast_if_autocast_enabled
-from apex.transformer.enums import AttnMaskType
-from fused_softmax_lib import (
-    scaled_masked_softmax_backward,
-    scaled_masked_softmax_forward,
-    scaled_masked_softmax_get_batch_per_block,
-    scaled_upper_triang_masked_softmax_backward,
-    scaled_upper_triang_masked_softmax_forward,
-)
-
-
-class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
-    """
-    Fused operation that performs, in sequence:
-    1. scaling of the tensor,
-    2. application of an upper triangular mask (typically used in gpt models),
-    3. softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, scale):
-        scale_tensor = torch.tensor([scale])
-        probs = scaled_upper_triang_masked_softmax_forward(inputs, scale_tensor[0])
-        # Both the probabilities and the scale are needed again in backward.
-        ctx.save_for_backward(probs, scale_tensor)
-        return probs
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        probs, scale_tensor = ctx.saved_tensors
-        grad_inputs = scaled_upper_triang_masked_softmax_backward(
-            output_grads, probs, scale_tensor[0]
-        )
-        # No gradient is produced for scale.
-        return grad_inputs, None
-
-
-def scaled_upper_triang_masked_softmax(inputs, _, scale):
-    """Apply the fused scale + causal-mask + softmax to a 4D input.
-
-    The second positional argument is ignored. The last two dimensions of inputs
-    must be equal; the result has the same 4D shape as inputs.
-    """
-    batch, heads, q_len, k_len = inputs.size()
-    assert q_len == k_len, "causal mask is only for self attention"
-    # Collapse the two leading dimensions: (attn_batches, sq, sk)
-    flat_inputs = inputs.view(-1, q_len, k_len)
-    cast_args = _cast_if_autocast_enabled(flat_inputs, scale)
-    with torch.cuda.amp.autocast(enabled=False):
-        flat_probs = ScaledUpperTriangMaskedSoftmax.apply(*cast_args)
-    return flat_probs.view(batch, heads, q_len, k_len)
-
-
-# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
-# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
-# So I needed to manually write two `torch.autograd.Function` inheritances.
-# Fused operation which performs following three operations in sequence
-# 1. Scale the tensor.
-# 2. Apply the mask.
-# 3. Perform softmax.
-class ScaledMaskedSoftmax(torch.autograd.Function):
-    """Autograd wrapper around the fused scale + mask + softmax kernels."""
-
-    @staticmethod
-    def forward(ctx, inputs, mask, scale):
-        scale_tensor = torch.tensor([scale])
-        probs = scaled_masked_softmax_forward(inputs, mask, scale_tensor[0])
-        # Both the probabilities and the scale are needed again in backward.
-        ctx.save_for_backward(probs, scale_tensor)
-        return probs
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        probs, scale_tensor = ctx.saved_tensors
-        grad_inputs = scaled_masked_softmax_backward(output_grads, probs, scale_tensor[0])
-        # Neither mask nor scale receives a gradient.
-        return grad_inputs, None, None
-
-
-def scaled_masked_softmax(inputs, mask, scale):
-    """Apply the fused scale + mask + softmax; inputs is a 4D tensor (b, np, sq, sk)."""
-    cast_args = _cast_if_autocast_enabled(inputs, mask, scale)
-    # Autocast is disabled around the call; the arguments were already cast above.
-    with torch.cuda.amp.autocast(enabled=False):
-        probs = ScaledMaskedSoftmax.apply(*cast_args)
-    return probs
-
-
-class FusedScaleMaskSoftmax(torch.nn.Module):
-    """
-    fused operation: scaling + mask + softmax
-
-    Arguments:
-        input_in_fp16: flag to indicate if input in fp16 data format.
-        input_in_bf16: flag to indicate if input in bf16 data format.
-        attn_mask_type: attention mask type (pad or causal)
-        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
-        mask_func: mask function to be applied.
-        softmax_in_fp32: if true, softmax is performed at fp32 precision.
-        scale: scaling factor used in input tensor scaling.
-
-    Raises:
-        RuntimeError: if both input_in_fp16 and input_in_bf16 are set, or if scale is
-            given while softmax_in_fp32 is false.
-        ValueError: if fusion is requested with an attn_mask_type that is neither
-            causal nor padding.
-    """
-
-    def __init__(
-        self,
-        input_in_fp16,
-        input_in_bf16,
-        attn_mask_type,
-        scaled_masked_softmax_fusion,
-        mask_func,
-        softmax_in_fp32,
-        scale,
-    ):
-        super().__init__()
-        self.input_in_fp16 = input_in_fp16
-        self.input_in_bf16 = input_in_bf16
-        if self.input_in_fp16 and self.input_in_bf16:
-            raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
-        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
-        self.attn_mask_type = attn_mask_type
-        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
-        self.mask_func = mask_func
-        self.softmax_in_fp32 = softmax_in_fp32
-        self.scale = scale
-
-        if not (self.scale is None or softmax_in_fp32):
-            raise RuntimeError("softmax should be in fp32 when scaled")
-
-        # The fused implementation is only selected when fusion is requested.
-        if self.scaled_masked_softmax_fusion:
-            if self.attn_mask_type == AttnMaskType.causal:
-                self.fused_softmax_func = scaled_upper_triang_masked_softmax
-            elif self.attn_mask_type == AttnMaskType.padding:
-                self.fused_softmax_func = scaled_masked_softmax
-            else:
-                raise ValueError("Invalid attn_mask_type.")
-
-    def forward(self, input, mask):
-        """Return softmax probabilities for a 4D input, using the fused kernel if possible."""
-        # [b, np, sq, sk]
-        assert input.dim() == 4
-
-        if self.is_kernel_available(mask, *input.size()):
-            return self.forward_fused_softmax(input, mask)
-        return self.forward_torch_softmax(input, mask)
-
-    def is_kernel_available(self, mask, b, np, sq, sk):
-        """Return True when the fused kernel can handle an input of shape (b, np, sq, sk)."""
-        attn_batches = b * np
-
-        eligible = (
-            self.scaled_masked_softmax_fusion  # user wants to fuse
-            and self.input_in_float16  # input must be fp16 or bf16
-            and (
-                self.attn_mask_type == AttnMaskType.causal
-                or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
-            )
-            and 16 < sk <= 8192  # sk must be greater than 16 and at most 8192
-            and sq % 4 == 0  # sq must be a multiple of 4
-            and sk % 4 == 0  # sk must be a multiple of 4
-            and attn_batches % 4 == 0  # np * b must be a multiple of 4
-        )
-        if not eligible:
-            return False
-
-        # The sk range was already checked above, so no second range test is needed.
-        batch_per_block = self.get_batch_per_block(sq, sk, b, np)
-        if self.attn_mask_type == AttnMaskType.causal:
-            return attn_batches % batch_per_block == 0
-        return sq % batch_per_block == 0
-
-    def forward_fused_softmax(self, input, mask):
-        """Run the fused implementation; a missing scale is treated as 1.0."""
-        # input.shape = [b, np, sq, sk]
-        scale = self.scale if self.scale is not None else 1.0
-        return self.fused_softmax_func(input, mask, scale)
-
-    def forward_torch_softmax(self, input, mask):
-        """Unfused fallback: optional fp32 upcast, scale, mask, softmax, then downcast."""
-        upcast = self.input_in_float16 and self.softmax_in_fp32
-        if upcast:
-            input = input.float()
-
-        if self.scale is not None:
-            input = input * self.scale
-        mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
-
-        # Cast the probabilities back to the half-precision format of the input.
-        if upcast:
-            if self.input_in_fp16:
-                probs = probs.half()
-            else:
-                probs = probs.bfloat16()
-
-        return probs
-
-    @staticmethod
-    def get_batch_per_block(sq, sk, b, np):
-        """Forward to scaled_masked_softmax_get_batch_per_block from the extension."""
-        return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
--- a/vllm_flash_attn/layers/__init__.py
+++ b/vllm_flash_attn/layers/__init__.py
--- a/vllm_flash_attn/layers/patch_embed.py
+++ b/vllm_flash_attn/layers/patch_embed.py
-# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py
-# But we use nn.Linear instead of Conv2d and it's about 8x faster.
-
-from functools import partial
-
-import torch.nn as nn
-from einops import rearrange
-from torch import _assert
-from torch.nn.modules.utils import _pair
-
-try:
-    from flash_attn.ops.fused_dense import FusedDense
-except ImportError:
-    FusedDense = None
-
-
-class PatchEmbed(nn.Module):
-    """2D image to patch embedding, implemented with a linear projection per patch."""
-
-    def __init__(
-        self,
-        img_size=224,
-        patch_size=16,
-        in_chans=3,
-        embed_dim=768,
-        norm_layer=None,
-        flatten=True,
-        bias=True,
-        fused_bias_fc=False,
-    ):
-        super().__init__()
-        self.img_size = _pair(img_size)
-        self.patch_size = _pair(patch_size)
-        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
-        self.grid_size = (self.img_size[0] // patch_h, self.img_size[1] // patch_w)
-        self.num_patches = self.grid_size[0] * self.grid_size[1]
-        self.flatten = flatten
-        if fused_bias_fc and FusedDense is None:
-            raise ImportError("fused_dense is not installed")
-
-        # FusedDense is only chosen when it is requested and a bias is used.
-        if fused_bias_fc and bias:
-            linear_cls = FusedDense
-        else:
-            linear_cls = nn.Linear
-        self.proj = linear_cls(in_chans * patch_h * patch_w, embed_dim, bias=bias)
-        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
-
-    def forward(self, x):
-        """Project each patch of x; the input height/width must equal img_size."""
-        _, _, H, W = x.shape
-        _assert(
-            H == self.img_size[0],
-            f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
-        )
-        _assert(
-            W == self.img_size[1],
-            f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
-        )
-        # Gather the pixels of every patch into the last dimension before projecting.
-        patches = rearrange(
-            x,
-            "b c (h p1) (w p2) -> b h w (c p1 p2)",
-            p1=self.patch_size[0],
-            p2=self.patch_size[1],
-        )
-        x = self.proj(patches)
-        if self.flatten:
-            x = rearrange(x, "b h w c -> b (h w) c")
-        return self.norm(x)
--- a/vllm_flash_attn/layers/rotary.py
+++ b/vllm_flash_attn/layers/rotary.py
-# Copyright (c) 2023, Tri Dao.
-
-import math
-from typing import Optional, Tuple, Union
-
-import torch
-from einops import rearrange, repeat
-from flash_attn.ops.triton.rotary import apply_rotary
-
-
-def rotate_half(x, interleaved=False):
-    """Map the pairs (x1, x2) of the last dimension of x to (-x2, x1).
-
-    With interleaved=False the pairs are (first half, second half) of the last
-    dimension; with interleaved=True they are (even index, odd index) elements.
-    """
-    if interleaved:
-        even, odd = x[..., ::2], x[..., 1::2]
-        stacked = torch.stack((-odd, even), dim=-1)
-        return rearrange(stacked, "... d two -> ... (d two)", two=2)
-    first, second = x.chunk(2, dim=-1)
-    return torch.cat((-second, first), dim=-1)
-
-
-def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
-    """
-    Pure PyTorch rotary embedding applied to the leading rotary_dim features of x.
-
-    x: (batch_size, seqlen, nheads, headdim)
-    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
-    Features of x beyond rotary_dim are passed through unchanged.
-    """
-    rotary_dim = cos.shape[-1] * 2
-    assert rotary_dim <= x.shape[-1]
-    # Duplicate every cos/sin entry and add a broadcast axis just before the last dim.
-    pattern = "... d -> ... 1 (d 2)" if interleaved else "... d -> ... 1 (2 d)"
-    cos = repeat(cos, pattern)
-    sin = repeat(sin, pattern)
-    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
-    rotated = x_rot * cos + rotate_half(x_rot, interleaved) * sin
-    return torch.cat([rotated, x_pass], dim=-1)
-
-
-class ApplyRotaryEmb(torch.autograd.Function):
-    """Autograd wrapper around apply_rotary; backward re-applies it with conjugate=True."""
-
-    @staticmethod
-    def forward(
-        ctx,
-        x,
-        cos,
-        sin,
-        interleaved=False,
-        inplace=False,
-        seqlen_offsets: Union[int, torch.Tensor] = 0,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[int] = None,
-    ):
-        out = apply_rotary(
-            x,
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
-            interleaved=interleaved,
-            inplace=inplace,
-        )
-        saved = [cos, sin, cu_seqlens]
-        if isinstance(seqlen_offsets, int):
-            # Can't save int with save_for_backward, so keep it on ctx instead.
-            ctx.seqlen_offsets = seqlen_offsets
-        else:
-            saved.append(seqlen_offsets)
-            ctx.seqlen_offsets = None
-        ctx.save_for_backward(*saved)
-        ctx.interleaved = interleaved
-        ctx.inplace = inplace
-        ctx.max_seqlen = max_seqlen
-        return x if inplace else out
-
-    @staticmethod
-    def backward(ctx, do):
-        if ctx.seqlen_offsets is None:
-            # Offsets were a tensor and were stored alongside the other tensors.
-            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
-        else:
-            cos, sin, cu_seqlens = ctx.saved_tensors
-            seqlen_offsets = ctx.seqlen_offsets
-        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
-        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
-        if not (ctx.interleaved or ctx.inplace):
-            do = do.clone()
-        dx = apply_rotary(
-            do,
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=ctx.max_seqlen,
-            interleaved=ctx.interleaved,
-            inplace=ctx.inplace,
-            conjugate=True,
-        )
-        # Only x is differentiable; the other seven forward inputs get None.
-        return dx, None, None, None, None, None, None, None
-
-
-def apply_rotary_emb(
-    x,
-    cos,
-    sin,
-    interleaved=False,
-    inplace=False,
-    seqlen_offsets: Union[int, torch.Tensor] = 0,
-    cu_seqlens: Optional[torch.Tensor] = None,
-    max_seqlen: Optional[int] = None,
-):
-    """
-    Apply rotary embedding to the first rotary_dim of x (rotary_dim must be <= headdim).
-
-    Arguments:
-        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
-            else (total_seqlen, nheads, headdim)
-        cos, sin: (seqlen_rotary, rotary_dim / 2)
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
-            of 1st half and 2nd half (GPT-NeoX style).
-        inplace: if True, apply rotary embedding in-place.
-        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
-            Most commonly used in inference when we have KV cache.
-        cu_seqlens: (batch + 1,) or None
-        max_seqlen: int
-    Return:
-        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
-            else (total_seqlen, nheads, headdim)
-    """
-    # Function.apply only takes positional arguments, in the order forward declares them.
-    forward_args = (x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen)
-    return ApplyRotaryEmb.apply(*forward_args)
-
-
-# For backward compatibility
-apply_rotary_emb_func = apply_rotary_emb
-
-
-class ApplyRotaryEmbQKV_(torch.autograd.Function):
-    """In-place rotary embedding of the Q and K slices of a packed qkv tensor.
-
-    forward rotates qkv[:, :, 0] and qkv[:, :, 1] in place and returns qkv itself;
-    backward applies the same rotation with conjugate=True to the incoming gradient,
-    also in place.
-    """
-
-    @staticmethod
-    def forward(
-        ctx,
-        qkv,
-        cos,
-        sin,
-        cos_k=None,
-        sin_k=None,
-        interleaved=False,
-        seqlen_offsets: Union[int, torch.Tensor] = 0,
-    ):
-        batch, seqlen, three, nheads, headdim = qkv.shape
-        assert three == 3
-        if cos_k is None and sin_k is None and qkv.is_contiguous():
-            # Call 1 kernel instead of 2 kernels
-            # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
-            # dimensions, we get the same tensor
-            qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
-            apply_rotary(
-                qk, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
-            )
-        else:
-            # Q and K are rotated separately; K falls back to the Q tables when no
-            # dedicated K tables were supplied.
-            cos_k = cos if cos_k is None else cos_k
-            sin_k = sin if sin_k is None else sin_k
-            q, k = qkv[:, :, 0], qkv[:, :, 1]
-            apply_rotary(q, cos, sin, seqlen_offsets, interleaved=interleaved, inplace=True)
-            apply_rotary(k, cos_k, sin_k, seqlen_offsets, interleaved=interleaved, inplace=True)
-        # save_for_backward is called exactly once, after cos_k / sin_k are final.
-        if isinstance(seqlen_offsets, int):
-            # Can't save int with save_for_backward, so keep it on ctx instead.
-            ctx.save_for_backward(cos, sin, cos_k, sin_k)
-            ctx.seqlen_offsets = seqlen_offsets
-        else:
-            ctx.save_for_backward(cos, sin, cos_k, sin_k, seqlen_offsets)
-            ctx.seqlen_offsets = None
-        ctx.interleaved = interleaved
-        return qkv
-
-    @staticmethod
-    def backward(ctx, dqkv):
-        seqlen_offsets = ctx.seqlen_offsets
-        if seqlen_offsets is None:
-            # Offsets were a tensor and were stored alongside the other tensors.
-            cos, sin, cos_k, sin_k, seqlen_offsets = ctx.saved_tensors
-        else:
-            cos, sin, cos_k, sin_k = ctx.saved_tensors
-        if cos_k is None and sin_k is None and dqkv.is_contiguous():
-            # Call 1 kernel instead of 2 kernels
-            # We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
-            # dimensions, we get the same tensor
-            dqk = rearrange(dqkv[:, :, :2], "b s t h d -> b s (t h) d")
-            apply_rotary(
-                dqk,
-                cos,
-                sin,
-                seqlen_offsets=seqlen_offsets,
-                interleaved=ctx.interleaved,
-                inplace=True,
-                conjugate=True,
-            )
-        else:
-            cos_k = cos if cos_k is None else cos_k
-            sin_k = sin if sin_k is None else sin_k
-            dq, dk = dqkv[:, :, 0], dqkv[:, :, 1]
-            apply_rotary(
-                dq, cos, sin, seqlen_offsets, interleaved=ctx.interleaved, inplace=True, conjugate=True
-            )
-            apply_rotary(
-                dk,
-                cos_k,
-                sin_k,
-                seqlen_offsets,
-                interleaved=ctx.interleaved,
-                inplace=True,
-                conjugate=True,
-            )
-        # Only qkv is differentiable; the other six forward inputs get None.
-        return dqkv, None, None, None, None, None, None
-
-
-def apply_rotary_emb_qkv_(
-    qkv,
-    cos,
-    sin,
-    cos_k=None,
-    sin_k=None,
-    interleaved=False,
-    seqlen_offsets: Union[int, torch.Tensor] = 0,
-):
-    """
-    Apply rotary embedding *inplace* to the first rotary_dim of Q and K
-    (rotary_dim must be <= headdim).
-
-    Arguments:
-        qkv: (batch_size, seqlen, 3, nheads, headdim)
-        cos, sin: (seqlen, rotary_dim / 2)
-        cos_k, sin_k: (seqlen, rotary_dim / 2), optional
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
-            1st half and 2nd half (GPT-NeoX style).
-        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
-            Most commonly used in inference when we have KV cache.
-    Return:
-        qkv: (batch_size, seqlen, 3, nheads, headdim)
-    """
-    # Function.apply only takes positional arguments, in the order forward declares them.
-    forward_args = (qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
-    return ApplyRotaryEmbQKV_.apply(*forward_args)
-
-
-class ApplyRotaryEmbKV_(torch.autograd.Function):
-    """In-place rotary embedding of the K slice (index 0 of dim 2) of a packed kv tensor."""
-
-    @staticmethod
-    def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0):
-        batch, seqlen, two, nheads, headdim = kv.shape
-        assert two == 2
-        apply_rotary(
-            kv[:, :, 0],
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            interleaved=interleaved,
-            inplace=True,
-        )
-        saved = [cos, sin]
-        if isinstance(seqlen_offsets, int):
-            # Can't save int with save_for_backward, so keep it on ctx instead.
-            ctx.seqlen_offsets = seqlen_offsets
-        else:
-            saved.append(seqlen_offsets)
-            ctx.seqlen_offsets = None
-        ctx.save_for_backward(*saved)
-        ctx.interleaved = interleaved
-        return kv
-
-    @staticmethod
-    def backward(ctx, dkv):
-        if ctx.seqlen_offsets is None:
-            # Offsets were a tensor and were stored alongside cos and sin.
-            cos, sin, seqlen_offsets = ctx.saved_tensors
-        else:
-            cos, sin = ctx.saved_tensors
-            seqlen_offsets = ctx.seqlen_offsets
-        dk = dkv[:, :, 0]
-        apply_rotary(
-            dk,
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            interleaved=ctx.interleaved,
-            inplace=True,
-            conjugate=True,
-        )
-        # Only kv is differentiable; the other four forward inputs get None.
-        return dkv, None, None, None, None
-
-
-def apply_rotary_emb_kv_(
-    kv,
-    cos,
-    sin,
-    interleaved=False,
-    seqlen_offsets: Union[int, torch.Tensor] = 0,
-):
-    """
-    Apply rotary embedding *inplace* to the first rotary_dim of K
-    (rotary_dim must be <= headdim).
-
-    Arguments:
-        kv: (batch_size, seqlen, 2, nheads, headdim)
-        cos, sin: (seqlen, rotary_dim / 2)
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
-            1st half and 2nd half (GPT-NeoX style).
-        seqlen_offsets: (batch_size,) or int. Each sequence in K is shifted by this amount.
-            Most commonly used in inference when we have KV cache.
-    Return:
-        kv: (batch_size, seqlen, 2, nheads, headdim)
-    """
-    return ApplyRotaryEmbKV_.apply(kv, cos, sin, interleaved, seqlen_offsets)
-
-
-class RotaryEmbedding(torch.nn.Module):
-    """
-    The rotary position embeddings from RoFormer_ (Su et al.).
-    A crucial insight from the method is that the query and keys are
-    transformed by rotation matrices which depend on the relative positions.
-
-    Other implementations are available in the Rotary Transformer repo_ and in
-    GPT-NeoX_, GPT-NeoX was an inspiration
-
-    .. _RoFormer: https://arxiv.org/abs/2104.09864
-    .. _repo: https://github.com/ZhuiyiTechnology/roformer
-    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
-
-    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
-    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
-    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        base=10000.0,
-        interleaved=False,
-        scale_base=None,
-        pos_idx_in_fp32=True,
-        device=None,
-    ):
-        """
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
-            of 1st half and 2nd half (GPT-NeoX style).
-        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
-            otherwise they might be in lower precision.
-            Before 2023-07-02 the position indices were built with the dtype of
-            self.inv_freq. That is usually fp32, but for a model trained in pure bf16
-            (not mixed precision) self.inv_freq, and therefore the position indices,
-            would be bf16. Because of the limited precision of bf16 (e.g. 1995.0 is
-            rounded to 2000.0), the embeddings for some positions will coincide.
-            This option exists to stay compatible with models previously trained in
-            pure bf16.
-        """
-        super().__init__()
-        self.dim = dim
-        self.base = float(base)
-        self.pos_idx_in_fp32 = pos_idx_in_fp32
-        self.interleaved = interleaved
-        self.scale_base = scale_base
-        # The inverse frequencies are a non-trainable buffer kept out of the state dict.
-        self.register_buffer("inv_freq", self._compute_inv_freq(device), persistent=False)
-        if scale_base is None:
-            scale = None
-        else:
-            even_idx = torch.arange(0, dim, 2, device=device, dtype=torch.float32)
-            scale = (even_idx + 0.4 * dim) / (1.4 * dim)
-        self.register_buffer("scale", scale, persistent=False)
-
-        # Lazily filled cos/sin tables; see _update_cos_sin_cache.
-        self._seq_len_cached = 0
-        self._cos_cached = self._sin_cached = None
-        self._cos_k_cached = self._sin_k_cached = None
-
-    def _compute_inv_freq(self, device=None):
-        """Return the fp32 tensor 1 / base ** (i / dim) for i = 0, 2, 4, ... below dim."""
-        exponents = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim
-        return 1.0 / (self.base**exponents)
-
-    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
-        """Rebuild the cached cos/sin tables when the current ones cannot be reused.
-
-        The tables are rebuilt when seqlen exceeds the cached length, when nothing is
-        cached yet, when device or dtype differ from the cached tensors (possibly due
-        to tracing for instance), or when switching from inference mode to training.
-        """
-        stale = (
-            seqlen > self._seq_len_cached
-            or self._cos_cached is None
-            or self._cos_cached.device != device
-            or self._cos_cached.dtype != dtype
-            or (self.training and self._cos_cached.is_inference())
-        )
-        if not stale:
-            return
-
-        self._seq_len_cached = seqlen
-        if self.pos_idx_in_fp32:
-            # fp32 rather than self.inv_freq.dtype: the model could be loaded in bf16, and
-            # both the positions and their product with inv_freq get large enough that
-            # bf16 would change cos & sin significantly.
-            positions = torch.arange(seqlen, device=device, dtype=torch.float32)
-            if self.inv_freq.dtype == torch.float32:
-                inv_freq = self.inv_freq
-            else:
-                # inv_freq was not loaded in fp32, so recompute it in fp32.
-                inv_freq = self._compute_inv_freq(device=device)
-        else:
-            # Compatibility mode: positions follow the dtype of self.inv_freq.
-            positions = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-            inv_freq = self.inv_freq
-
-        # Don't do einsum, it converts fp32 to fp16 under AMP
-        freqs = torch.outer(positions, inv_freq)
-        cos, sin = torch.cos(freqs), torch.sin(freqs)
-        if self.scale is None:
-            self._cos_cached = cos.to(dtype)
-            self._sin_cached = sin.to(dtype)
-            return
-
-        centered = (
-            torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
-        )
-        power = centered / self.scale_base
-        scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
-        # We want the multiplication by scale to happen in fp32
-        self._cos_cached = (cos * scale).to(dtype)
-        self._sin_cached = (sin * scale).to(dtype)
-        self._cos_k_cached = (cos / scale).to(dtype)
-        self._sin_k_cached = (sin / scale).to(dtype)
-
-    def forward(
-        self,
-        qkv: torch.Tensor,
-        kv: Optional[torch.Tensor] = None,
-        seqlen_offset: Union[int, torch.Tensor] = 0,
-        max_seqlen: Optional[int] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
-             else it's just q of shape (batch, seqlen, nheads, headdim)
-        kv: (batch, seqlen, 2, nheads, headdim)
-        seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
-            Most commonly used in inference when we have KV cache.
-            If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
-            should pass in max_seqlen, which will update the cos / sin cache up to that length.
-        Apply rotary embedding *inplace* to qkv and / or kv.
-        """
-        seqlen = qkv.shape[1]
-        if max_seqlen is not None:
-            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
-        elif isinstance(seqlen_offset, int):
-            self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
-        if kv is None:
-            if self.scale is None:
-                return apply_rotary_emb_qkv_(
-                    qkv,
-                    self._cos_cached,
-                    self._sin_cached,
-                    interleaved=self.interleaved,
-                    seqlen_offsets=seqlen_offset,
-                )
-            else:
-                return apply_rotary_emb_qkv_(
-                    qkv,
-                    self._cos_cached,
-                    self._sin_cached,
-                    self._cos_k_cached,
-                    self._sin_k_cached,
-                    interleaved=self.interleaved,
-                    seqlen_offsets=seqlen_offset,
-                )
-        else:
-            q = qkv
-            q = apply_rotary_emb_func(
-                q,
-                self._cos_cached,
-                self._sin_cached,
-                interleaved=self.interleaved,
-                inplace=True,
-                seqlen_offsets=seqlen_offset,
-            )
-            if self.scale is None:
-                kv = apply_rotary_emb_kv_(
-                    kv,
-                    self._cos_cached,
-                    self._sin_cached,
-                    interleaved=self.interleaved,
-                    seqlen_offsets=seqlen_offset,
-                )
-            else:
-                kv = apply_rotary_emb_kv_(
-                    kv,
-                    self._cos_k_cached,
-                    self._sin_k_cached,
-                    interleaved=self.interleaved,
-                    seqlen_offsets=seqlen_offset,
-                )
-            return q, kv
--- a/vllm_flash_attn/losses/__init__.py
+++ b/vllm_flash_attn/losses/__init__.py
--- a/vllm_flash_attn/losses/cross_entropy.py
+++ b/vllm_flash_attn/losses/cross_entropy.py
-# Copyright (c) 2023, Tri Dao.
-
-import torch
-import torch.nn as nn
-
-from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
-
-
-class CrossEntropyLoss(nn.Module):
-    def __init__(
-        self,
-        ignore_index=-100,
-        reduction="mean",
-        label_smoothing=0.0,
-        logit_scale=1.0,
-        lse_square_scale=0.0,
-        inplace_backward=False,
-        process_group=None,
-        return_z_loss=False,
-    ):
-        """
-        Arguments:
-            ignored_index: int. If labels == ignored_index, the loss is set to 0.0.
-            label_smoothing: float
-            lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
-                This is also referred to as "z-loss".
-            inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
-                This saves memory.
-            process_group: if not None, we're doing Tensor Parallel: each process is responsible for
-                one part of the vocab. The loss will be aggregated across processes.
-            return_z_loss: bool. If True, we return the component of the loss contributed by
-                the lse_square_scale value. This value is only for logging and does not support
-                backprop.
-        """
-        super().__init__()
-        if reduction not in ["mean", "none", "sum"]:
-            raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
-        self.ignore_index = ignore_index
-        self.reduction = reduction
-        self.label_smoothing = label_smoothing
-        self.logit_scale = logit_scale
-        self.lse_square_scale = lse_square_scale
-        self.inplace_backward = inplace_backward
-        self.process_group = process_group
-        self.return_z_loss = return_z_loss
-
-    def forward(self, input, target):
-        """
-        Arguments:
-            input: (batch, vocab_size)
-            target: (batch,)
-        Returns:
-            losses: (batch,) if reduction is 'none', else (1,), dtype float
-            z_loss: (batch,) if reduction is 'none', else (1,), dtype float (if self.return_z_loss)
-        """
-        assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
-        loss, z_loss = cross_entropy_loss(
-            input,
-            target,
-            label_smoothing=self.label_smoothing,
-            logit_scale=self.logit_scale,
-            lse_square_scale=self.lse_square_scale,
-            ignored_index=self.ignore_index,
-            inplace_backward=self.inplace_backward,
-            process_group=self.process_group,
-        )
-        if self.reduction == "mean":
-            loss = loss.sum() / (target != self.ignore_index).sum()
-        elif self.reduction == "sum":
-            loss = loss.sum()
-        else:
-            loss = loss
-
-        if not self.return_z_loss:
-            return loss
-
-        if self.reduction == "mean":
-            z_loss = z_loss.sum() / (target != self.ignore_index).sum()
-        elif self.reduction == "sum":
-            z_loss = z_loss.sum()
-        else:
-            z_loss = z_loss
-
-        return loss, z_loss
--- a/vllm_flash_attn/models/__init__.py
+++ b/vllm_flash_attn/models/__init__.py
--- a/vllm_flash_attn/models/baichuan.py
+++ b/vllm_flash_attn/models/baichuan.py
-# Copyright (c) 2023, GGGGGGXY, Tri Dao.
-
-import math
-import json
-import re
-from pathlib import Path
-
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-
-from einops import rearrange
-from transformers import GPT2Config, AutoConfig, PretrainedConfig
-
-
-def remap_state_dict_hf_baichuan(state_dict, config):
-    def key_mapping_layers(key):
-        return re.sub(r"^model.", "transformer.", key)
-
-    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
-
-    # Word embedding
-    def key_mapping_emb(key):
-        return re.sub(
-            r"^transformer.embed_tokens.",
-            "transformer.embeddings.word_embeddings.",
-            key,
-        )
-
-    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = (
-        math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple)
-        * pad_vocab_size_multiple
-    )
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    if getattr(config, "tie_word_embeddings"):
-        state_dict["lm_head.weight"] = state_dict[
-            "transformer.embeddings.word_embeddings.weight"
-        ]
-    else:
-        output_embeddings = state_dict.pop("lm_head.weight")
-        # Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings
-        # differently.
-        vocab_size = (
-            math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
-            * pad_vocab_size_multiple
-        )
-        # It's possible that vocab_size is padded to be a multiple of 8, for example.
-        state_dict["lm_head.weight"] = F.pad(
-            output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
-        )
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
-        key = re.sub(
-            r"^transformer.layers.(\d+).input_layernorm.",
-            r"transformer.layers.\1.norm1.",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).post_attention_layernorm.",
-            r"transformer.layers.\1.norm2.",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLP
-    for l in range(config.n_layer):
-        w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight")
-        w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight")
-        # Our ordering is different
-        state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat(
-            [w3, w1], dim=0
-        )
-
-    def key_mapping_mlp(key):
-        return re.sub(
-            r"^transformer.layers.(\d+).mlp.down_proj.",
-            r"transformer.layers.\1.mlp.fc2.",
-            key,
-        )
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # Attention
-    def key_mapping_attn(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).self_attn.W_pack.",
-            r"transformer.layers.\1.mixer.Wqkv.",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).self_attn.o_proj.",
-            r"transformer.layers.\1.mixer.out_proj.",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-    for l in range(config.n_layer):
-        # pop rotary_emb.inv_freq from state dict
-        state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None)
-    return state_dict
-
-
-def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config:
-    # HACK: the config doesn't have say whether it's rotary or alibi.
-    # So we have to infer from the hidden size (7B -> rotary, 13B -> alibi).
-    # HACK: the config doesn't have say whether it uses norm head.
-    # So we have to infer from the vocab size
-    # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head).
-    use_rotary = baichuan_config.hidden_size < 5000
-    return GPT2Config(
-        vocab_size=baichuan_config.vocab_size,
-        n_positions=0,  # No absolute position embedding
-        n_embd=baichuan_config.hidden_size,
-        n_layer=baichuan_config.num_hidden_layers,
-        n_head=baichuan_config.num_attention_heads,
-        n_inner=baichuan_config.intermediate_size,
-        activation_function="swiglu",  # Hardcode since HF calls it 'silu'
-        # baichuan doesn't have dropout, idk if it's because they only release the inference code
-        resid_pdrop=0.0,
-        embd_pdrop=0.0,
-        attn_pdrop=0.0,
-        layer_norm_epsilon=baichuan_config.rms_norm_eps,
-        initializer_range=baichuan_config.initializer_range,
-        bos_token_id=baichuan_config.bos_token_id,
-        eos_token_id=baichuan_config.eos_token_id,
-        # These are new arguments not in the original GPT2Config
-        pad_token_id=baichuan_config.pad_token_id,  # Idk if this does anything
-        rms_norm=True,
-        rotary_emb_fraction=1.0 if use_rotary else 0.0,
-        rotary_emb_interleaved=False,
-        use_alibi=not use_rotary,
-        use_flash_attn=not use_rotary,  # Alibi code path requires flash_attn
-        tie_word_embeddings=False,
-        norm_head=baichuan_config.vocab_size > 70000,
-        qkv_proj_bias=False,
-        out_proj_bias=False,
-        mlp_fc1_bias=False,
-        mlp_fc2_bias=False,
-    )
--- a/vllm_flash_attn/models/bert.py
+++ b/vllm_flash_attn/models/bert.py
--- a/vllm_flash_attn/models/bigcode.py
+++ b/vllm_flash_attn/models/bigcode.py
-import math
-import re
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig
-
-
-def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
-    """
-    Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
-    """
-
-    # Word embedding and position embedding
-    def key_mapping_pos_emb(key):
-        return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
-
-    state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.wte.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
-        key = re.sub(
-            r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
-            r"transformer.layers.\1.norm\2.\3",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    def key_mapping_mlp(key):
-        key = re.sub(
-            r"^transformer.h.(\d+).mlp.c_fc.weight",
-            r"transformer.layers.\1.mlp.fc1.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.h.(\d+).mlp.c_proj.weight",
-            r"transformer.layers.\1.mlp.fc2.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.h.(\d+).mlp.c_fc.bias",
-            r"transformer.layers.\1.mlp.fc1.bias",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.h.(\d+).mlp.c_proj.bias",
-            r"transformer.layers.\1.mlp.fc2.bias",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # TODO: add support for multi-head attention
-    assert config.multi_query, "Only multi-query attention is supported"
-
-    # Attention
-    for d in range(config.num_hidden_layers):
-        embed_dim = config.n_embd
-        head_dim = embed_dim // config.n_head
-
-        c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
-        # with multi-query attention, the weights have shape (embed_dim, embed_dim + head_dim + head_dim)
-        # see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112
-        # see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183
-        # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
-        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-        k = torch.tile(k, (config.n_head, 1))
-        v = torch.tile(v, (config.n_head, 1))
-        state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)
-
-        # same deal with the bias
-        c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
-        # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
-        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-        k = torch.tile(k, (config.n_head,))
-        v = torch.tile(v, (config.n_head,))
-        state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)
-
-    def key_mapping_attn(key):
-        key = re.sub(
-            r"^transformer.h.(\d+).attn.c_proj.weight",
-            r"transformer.layers.\1.mixer.out_proj.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.h.(\d+).attn.c_proj.bias",
-            r"transformer.layers.\1.mixer.out_proj.bias",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-
-    return state_dict
-
-
-def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
-    """
-    Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.
-
-    This function is meant to be the inverse of remap_state_dict_hf_bigcode.
-    """
-
-    # Word embedding and position embeddings
-    def inv_key_mapping_pos_emb(key):
-        return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)
-
-    state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
-
-    word_embeddings = word_embeddings[:, : config.vocab_size]
-    state_dict["transformer.wte.weight"] = word_embeddings
-    state_dict["lm_head.weight"] = word_embeddings
-
-    # LayerNorm
-    def inv_key_mapping_ln(key):
-        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
-        key = re.sub(
-            r"^transformer.layers.(\d+).norm(1|2).(weight|bias)",
-            r"transformer.h.\1.ln_\2.\3",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLPs
-    def inv_key_mapping_mlp(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc1.weight",
-            r"transformer.h.\1.mlp.c_fc.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc2.weight",
-            r"transformer.h.\1.mlp.c_proj.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc1.bias",
-            r"transformer.h.\1.mlp.c_fc.bias",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc2.bias",
-            r"transformer.h.\1.mlp.c_proj.bias",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # Attention
-    for d in range(config.num_hidden_layers):
-        embed_dim = config.n_embd
-        head_dim = embed_dim // config.n_head
-
-        Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
-        q, k, v = torch.split(
-            Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
-        )
-        c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
-        state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight
-
-        # Same deal with the bias
-        Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
-        q, k, v = torch.split(
-            Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
-        )
-        c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
-        state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias
-
-    def inv_key_mapping_attn(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).mixer.out_proj.weight",
-            r"transformer.h.\1.attn.c_proj.weight",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mixer.out_proj.bias",
-            r"transformer.h.\1.attn.c_proj.bias",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())
-
-    return state_dict
-
-
-def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
-    return GPT2Config(
-        activation_function=bigcode_config.activation_function,
-        attn_pdrop=bigcode_config.attn_pdrop,
-        bos_token_id=bigcode_config.bos_token_id,
-        embd_pdrop=bigcode_config.embd_pdrop,
-        eos_token_id=bigcode_config.eos_token_id,
-        initializer_range=bigcode_config.initializer_range,
-        layer_norm_epsilon=bigcode_config.layer_norm_epsilon,
-        max_batch_size=bigcode_config.max_batch_size,
-        max_sequence_length=bigcode_config.max_sequence_length,
-        model_type=bigcode_config.model_type,
-        multi_query=bigcode_config.multi_query,
-        n_embd=bigcode_config.n_embd,
-        n_head=bigcode_config.n_head,
-        n_inner=bigcode_config.n_inner,
-        n_layer=bigcode_config.n_layer,
-        n_positions=bigcode_config.n_positions,
-        resid_pdrop=bigcode_config.resid_pdrop,
-        scale_attn_weights=bigcode_config.scale_attn_weights,
-        summary_activation=bigcode_config.summary_activation,
-        summary_first_dropout=bigcode_config.summary_first_dropout,
-        summary_proj_to_labels=bigcode_config.summary_proj_to_labels,
-        summary_type=bigcode_config.summary_type,
-        summary_use_proj=bigcode_config.summary_use_proj,
-        use_cache=bigcode_config.use_cache,
-        vocab_size=bigcode_config.vocab_size,
-    )
--- a/vllm_flash_attn/models/btlm.py
+++ b/vllm_flash_attn/models/btlm.py
-# Copyright (c) 2023, Tri Dao.
-
-import math
-import json
-import re
-from pathlib import Path
-
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-
-from einops import rearrange
-from transformers import GPT2Config, AutoConfig, PretrainedConfig
-
-
-def remap_state_dict_hf_btlm(state_dict, config):
-    # Word embedding and position embedding
-    def key_mapping_pos_emb(key):
-        return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
-
-    if "transformer.wpe.weight" in state_dict:
-        state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.wte.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
-        key = re.sub(r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
-        return key
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLP
-    for d in range(config.num_hidden_layers):
-        W1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.weight")
-        W3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.weight")
-        state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = torch.cat([W1.t(), W3.t()], dim=0)
-        b1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.bias")
-        b3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.bias")
-        state_dict[f"transformer.layers.{d}.mlp.fc1.bias"] = torch.cat([b1, b3], dim=0)
-        W2 = state_dict.pop(f"transformer.h.{d}.mlp.c_proj.weight")
-        state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
-
-    def key_mapping_mlp(key):
-        key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
-        return key
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # Attention
-    for d in range(config.num_hidden_layers):
-        Wqkv = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
-        state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
-        Wout = state_dict.pop(f"transformer.h.{d}.attn.c_proj.weight")
-        state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
-    state_dict.pop(f"transformer.relative_pe.slopes")  # We don't store the Alibi slopes
-
-    def key_mapping_attn(key):
-        key = re.sub(r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
-        key = re.sub(
-            r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-
-    return state_dict
-
-
-def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config:
-    return GPT2Config(
-        vocab_size=btlm_config.vocab_size,
-        n_positions=0 if btlm_config.position_embedding_type == "alibi" else btlm_config.n_positions,
-        n_embd=btlm_config.hidden_size,
-        n_layer=btlm_config.num_hidden_layers,
-        n_head=btlm_config.num_attention_heads,
-        n_inner=btlm_config.n_inner,
-        activation_function=btlm_config.activation_function,
-        resid_pdrop=btlm_config.resid_pdrop,
-        embd_pdrop=btlm_config.embd_pdrop,
-        attn_pdrop=btlm_config.attn_pdrop,
-        layer_norm_epsilon=btlm_config.layer_norm_epsilon,
-        initializer_range=btlm_config.initializer_range,
-        bos_token_id=btlm_config.bos_token_id,
-        eos_token_id=btlm_config.eos_token_id,
-        # These are new arguments not in the original GPT2Config
-        use_alibi=btlm_config.position_embedding_type == "alibi",
-        use_flash_attn=btlm_config.position_embedding_type == "alibi",  # Alibi code path requires flash_attn
-        mup_width_scale=btlm_config.mup_width_scale,
-        mup_embeddings_multiplier=btlm_config.mup_embeddings_scale,
-        mup_output_multiplier=btlm_config.mup_output_alpha,
-        mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d,
-        mlp_multiple_of=1,
-    )
--- a/vllm_flash_attn/models/falcon.py
+++ b/vllm_flash_attn/models/falcon.py
-# Copyright (c) 2023, Tri Dao.
-
-import math
-import re
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-from einops import rearrange
-from transformers import FalconConfig, GPT2Config
-
-
-def remap_state_dict_hf_falcon(state_dict, config):
-    def key_mapping_layers(key):
-        return re.sub(r"^transformer.h.", "transformer.layers.", key)
-
-    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
-    # Word embedding
-    def key_mapping_emb(key):
-        return re.sub(
-            r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key
-        )
-
-    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    if getattr(config, "tie_word_embeddings"):
-        state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
-    else:
-        output_embeddings = state_dict.pop("lm_head.weight")
-        # It's possible that vocab_size is padded to be a multiple of 8, for example.
-        state_dict["lm_head.weight"] = F.pad(
-            output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
-        )
-        output_embeddings_bias = state_dict.pop("lm_head.bias")
-        state_dict["lm_head.bias"] = F.pad(
-            output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
-        )
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).post_attention_layernorm.",
-            r"transformer.layers.\1.norm2.",
-            key,
-        )
-        key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key)
-        key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key)
-        return key
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLP
-    def key_mapping_mlp(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    def key_mapping_attn(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).self_attention.query_key_value.",
-            r"transformer.layers.\1.mixer.Wqkv.",
-            key,
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).self_attention.dense.",
-            r"transformer.layers.\1.mixer.out_proj.",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-    n_head = config.n_head
-    n_head_kv = getattr(config, "n_head_kv", 1)
-    headdim = config.hidden_size // n_head
-    for l in range(config.n_layer):
-        # The weights are stored in a different layout compared to our implementation
-        Wqkv = rearrange(
-            state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"),
-            "(group ratio headdim) ... -> group ratio headdim ...",
-            ratio=n_head // n_head_kv + 2,
-            headdim=headdim,
-        )
-        Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...")
-        Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...")
-        Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...")
-        state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
-
-    return state_dict
-
-
-def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config:
-    # The 40b config uses "n_head_kv" instead of "num_kv_heads"
-    n_head_kv = getattr(
-        falcon_config,
-        "n_head_kv",
-        1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head,
-    )
-    # HACK: the 40b config has 2 LN per layer instead of 1, but that's not reflected in the config.
-    # So we have to infer it from the number of heads in the key/value block
-    parallel_block_tied_norm = n_head_kv == 1
-    return GPT2Config(
-        vocab_size=falcon_config.vocab_size,
-        n_positions=0,  # No absolute position embedding
-        n_embd=falcon_config.hidden_size,
-        n_layer=falcon_config.n_layer,
-        n_head=falcon_config.n_head,
-        n_inner=falcon_config.hidden_size * 4,
-        activation_function="gelu",
-        resid_pdrop=falcon_config.hidden_dropout,
-        embd_pdrop=0.0,  # There doesn't seem to be any embedding dropout
-        attn_pdrop=falcon_config.attention_dropout,
-        layer_norm_epsilon=falcon_config.layer_norm_epsilon,
-        initializer_range=falcon_config.initializer_range,
-        bos_token_id=falcon_config.bos_token_id,
-        eos_token_id=falcon_config.eos_token_id,
-        # These are new arguments not in the original GPT2Config
-        parallel_block=falcon_config.parallel_attn,
-        n_head_kv=n_head_kv,
-        parallel_block_tied_norm=parallel_block_tied_norm,
-        rotary_emb_fraction=1.0,
-        rotary_emb_interleaved=False,
-        tie_word_embeddings=True,
-        qkv_proj_bias=falcon_config.bias,
-        out_proj_bias=falcon_config.bias,
-        mlp_fc1_bias=falcon_config.bias,
-        mlp_fc2_bias=falcon_config.bias,
-        lm_head_bias=False,
-    )
--- a/vllm_flash_attn/models/gpt.py
+++ b/vllm_flash_attn/models/gpt.py
--- a/vllm_flash_attn/models/gpt_neox.py
+++ b/vllm_flash_attn/models/gpt_neox.py
-# Copyright (c) 2023, Tri Dao.
-
-import math
-import re
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-from einops import rearrange
-from transformers import GPT2Config, GPTNeoXConfig
-
-
-def remap_state_dict_hf_gpt_neox(state_dict, config):
-    def key_mapping_layers(key):
-        return re.sub(r"^gpt_neox.", "transformer.", key)
-
-    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
-    # Word embedding
-    def key_mapping_emb(key):
-        return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key)
-
-    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    if getattr(config, "tie_word_embeddings", False):
-        state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
-    else:
-        output_embeddings = state_dict.pop("embed_out.weight")
-        # It's possible that vocab_size is padded to be a multiple of 8, for example.
-        state_dict["lm_head.weight"] = F.pad(
-            output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
-        )
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key)
-        key = re.sub(
-            r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).post_attention_layernorm.",
-            r"transformer.layers.\1.norm2.",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLP
-    def key_mapping_mlp(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # Attention
-    for l in range(config.n_layer):
-        # We don't store these biases
-        state_dict.pop(f"transformer.layers.{l}.attention.bias")
-        state_dict.pop(f"transformer.layers.{l}.attention.masked_bias")
-        # We don't store these
-        state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None)
-        # GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim)
-        # while we store Wqkv as ((3 nheads headdim), hidden_dim)
-        headdim = config.hidden_size // config.num_attention_heads
-        Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight")
-        state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange(
-            Wqkv,
-            "(nheads three headdim) ... -> (three nheads headdim) ...",
-            three=3,
-            headdim=headdim,
-        )
-        bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias")
-        state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange(
-            bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
-        )
-
-    def key_mapping_attn(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).attention.dense.",
-            r"transformer.layers.\1.mixer.out_proj.",
-            key,
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-
-    return state_dict
-
-
-def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config:
-    assert gpt_neox_config.rotary_emb_base == 10000
-    return GPT2Config(
-        vocab_size=gpt_neox_config.vocab_size,
-        n_positions=0,  # No absolute position embedding
-        n_embd=gpt_neox_config.hidden_size,
-        n_layer=gpt_neox_config.num_hidden_layers,
-        n_head=gpt_neox_config.num_attention_heads,
-        n_inner=gpt_neox_config.intermediate_size,
-        activation_function=gpt_neox_config.hidden_act,
-        resid_pdrop=0.0,  # No dropout
-        embd_pdrop=0.0,
-        attn_pdrop=0.0,
-        layer_norm_epsilon=gpt_neox_config.layer_norm_eps,
-        initializer_range=gpt_neox_config.initializer_range,
-        bos_token_id=gpt_neox_config.bos_token_id,
-        eos_token_id=gpt_neox_config.eos_token_id,
-        # These are new arguments not in the original GPT2Config
-        prenorm=True,
-        parallel_block=gpt_neox_config.use_parallel_residual,
-        parallel_block_tied_norm=False,
-        rotary_emb_fraction=gpt_neox_config.rotary_pct,
-        tie_word_embeddings=gpt_neox_config.tie_word_embeddings,
-    )
--- a/vllm_flash_attn/models/gptj.py
+++ b/vllm_flash_attn/models/gptj.py
-# Copyright (c) 2023, Tri Dao.
-
-import math
-import re
-from collections import OrderedDict
-
-import torch
-import torch.nn.functional as F
-from transformers import GPT2Config, GPTJConfig
-
-
-def remap_state_dict_hf_gptj(state_dict, config):
-    def key_mapping_layers(key):
-        return re.sub(r"^transformer.h.", "transformer.layers.", key)
-
-    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
-    # Word embedding
-    def key_mapping_emb(key):
-        return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key)
-
-    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
-    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
-    # It's possible that vocab_size is padded to be a multiple of 8, for example.
-    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
-    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
-        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
-    )
-    if getattr(config, "tie_word_embeddings"):
-        state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
-    else:
-        output_embeddings = state_dict.pop("lm_head.weight")
-        # It's possible that vocab_size is padded to be a multiple of 8, for example.
-        state_dict["lm_head.weight"] = F.pad(
-            output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
-        )
-        output_embeddings_bias = state_dict.pop("lm_head.bias")
-        state_dict["lm_head.bias"] = F.pad(
-            output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
-        )
-
-    # LayerNorm
-    def key_mapping_ln(key):
-        return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key)
-
-    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
-
-    # MLP
-    def key_mapping_mlp(key):
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key
-        )
-        key = re.sub(
-            r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key
-        )
-        return key
-
-    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
-
-    # Attention
-    for l in range(config.n_layer):
-        Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight")
-        Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight")
-        Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight")
-        state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
-        # We don't store these biases
-        state_dict.pop(f"transformer.layers.{l}.attn.bias")
-        state_dict.pop(f"transformer.layers.{l}.attn.masked_bias")
-
-    def key_mapping_attn(key):
-        return re.sub(
-            r"^transformer.layers.(\d+).attn.out_proj.",
-            r"transformer.layers.\1.mixer.out_proj.",
-            key,
-        )
-
-    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
-
-    return state_dict
-
-
-def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config:
-    headdim = gptj_config.n_embd // gptj_config.n_head
-    return GPT2Config(
-        vocab_size=gptj_config.vocab_size,
-        n_positions=0,  # No absolute position embedding
-        n_embd=gptj_config.n_embd,
-        n_layer=gptj_config.n_layer,
-        n_head=gptj_config.n_head,
-        n_inner=gptj_config.n_inner,
-        activation_function=gptj_config.activation_function,
-        resid_pdrop=gptj_config.resid_pdrop,
-        embd_pdrop=gptj_config.embd_pdrop,
-        attn_pdrop=gptj_config.attn_pdrop,
-        layer_norm_epsilon=gptj_config.layer_norm_epsilon,
-        initializer_range=gptj_config.initializer_range,
-        bos_token_id=gptj_config.bos_token_id,
-        eos_token_id=gptj_config.eos_token_id,
-        # These are new arguments not in the original GPT2Config
-        prenorm=True,
-        parallel_block=True,
-        parallel_block_tied_norm=True,
-        rotary_emb_fraction=gptj_config.rotary_dim / headdim,
-        rotary_emb_interleaved=True,
-        tie_word_embeddings=False,
-        qkv_proj_bias=False,
-        out_proj_bias=False,
-        lm_head_bias=True,
-    )