Commit ae856f3a authored by Woosuk Kwon

Remove unnecessary files

parent 6ac8e63a
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
class IndexFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
# return input[indices]
return torch.gather(
rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
).reshape(-1, *other_shape)
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
grad_output = rearrange(grad_output, "b ... -> b (...)")
grad_input = torch.zeros(
[ctx.first_axis_dim, grad_output.shape[1]],
device=grad_output.device,
dtype=grad_output.dtype,
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
# grad_input[indices] = grad_output
grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis = IndexFirstAxis.apply
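# Hypothetical sanity sketch (not part of the original file; the helper name below
# is made up): in the forward pass index_first_axis selects rows along the first
# axis exactly like plain advanced indexing, while its backward scatters gradients
# with scatter_ as noted in the comments above.
def _example_index_first_axis():
    x = torch.randn(8, 4, 16)
    idx = torch.tensor([0, 3, 5])
    assert torch.equal(index_first_axis(x, idx), x[idx])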
class IndexPutFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, values, indices, first_axis_dim):
ctx.save_for_backward(indices)
assert indices.ndim == 1
assert values.ndim >= 2
output = torch.zeros(
first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
output[indices] = values
# output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
return output
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
grad_values = grad_output[indices]
# grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
return grad_values, None, None
index_put_first_axis = IndexPutFirstAxis.apply
class IndexFirstAxisResidual(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
output = input[indices]
# We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
# memory format to channel_first. In other words, input might not be contiguous.
# If we don't detach, Pytorch complains that the output is a view and is being modified in place
return output, input.detach()
@staticmethod
def backward(ctx, grad_output, grad_residual):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
assert grad_residual.shape[1:] == other_shape
grad_input = grad_residual
# grad_input[indices] += grad_output
indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
indices = indices.expand_as(grad_output)
grad_input.scatter_add_(0, indices, grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis_residual = IndexFirstAxisResidual.apply
def unpad_input(hidden_states, attention_mask):
"""
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
"""
Supports concatenating short samples into one sequence. The attention_mask_in_length is used to mask out the other short samples, which enables efficient training on variable-length samples (e.g., the supervised fine-tuning task for large language models).
The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
```
[
[2, 3, 0, 0, 0, 0],
[3, 2, 0, 0, 0, 0],
[6, 0, 0, 0, 0, 0]
]
```
which corresponds to the 3D attention mask:
```
[
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
[0, 0, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 1]
]
]
```
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask_in_length: (batch, seqlen), int, where a nonzero entry (e.g., 1, 2, 3) gives the length of a concatenated sub-sequence in the b-th batch entry, and 0 means none.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask_in_length.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
length = attention_mask_in_length.sum(dim=-1)
seqlen = attention_mask_in_length.size(-1)
attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
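# Hypothetical worked example (not part of the original file; the helper name is
# made up), reusing the attention_mask_in_length from the docstring above: three
# batch entries packing sub-sequences of lengths (2, 3), (3, 2) and (6,) into
# seqlen = 6, i.e. 5 + 5 + 6 = 16 valid tokens in total.
def _example_unpad_concatenated_sequences():
    hidden_states = torch.randn(3, 6, 8)
    attention_mask_in_length = torch.tensor(
        [[2, 3, 0, 0, 0, 0], [3, 2, 0, 0, 0, 0], [6, 0, 0, 0, 0, 0]]
    )
    out, indices, cu_seqlens, max_seqlen = unpad_input_for_concatenated_sequences(
        hidden_states, attention_mask_in_length
    )
    # out: (16, 8); cu_seqlens: [0, 2, 5, 8, 10, 16]; max_seqlen: 6
    assert out.shape == (16, 8) and max_seqlen == 6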
def pad_input(hidden_states, indices, batch, seqlen):
"""
Arguments:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
batch: int, batch size for the padded sequence.
seqlen: int, maximum sequence length for the padded sequence.
Return:
hidden_states: (batch, seqlen, ...)
"""
dim = hidden_states.shape[-1]
# output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
# output[indices] = hidden_states
output = index_put_first_axis(hidden_states, indices, batch * seqlen)
return rearrange(output, "(b s) ... -> b s ...", b=batch)
"""
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR) but it doesn't yet seem to work for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.
We use the FlashAttention implementation from Phil Tillet as a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
small batch size * nheads.
Caution:
- This is an *experimental* implementation. The forward pass should be quite robust but
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
(due to the Triton compiler), as done in tests/test_flash_attn.py
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
that there are none left for other head dimensions.
Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
"""
import math
import torch
import triton
import triton.language as tl
# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128
# @triton.autotune(
# configs=[
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1),
# # This config has a race condition when EVEN_M == False, disabling it for now.
# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
# ],
# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM']
# )
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _fwd_kernel(
Q,
K,
V,
Bias,
Out,
Lse,
TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_ob,
stride_oh,
stride_om,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# off_b = tl.program_id(1)
# off_h = tl.program_id(2)
# off_hb = off_b * nheads + off_h
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# Initialize pointers to Q, K, V
# Adding parentheses around indexing might use int32 math instead of int64 math?
# https://github.com/openai/triton/issues/741
# I'm seeing a tiny bit of difference (5-7us)
q_ptrs = (
Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
)
k_ptrs = (
K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
)
v_ptrs = (
V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
)
if BIAS_TYPE == "vector":
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = (
Bias
+ off_b * stride_bb
+ off_h * stride_bh
+ (offs_m[:, None] * stride_bm + offs_n[None, :])
)
# initialize pointer to m and l
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
# load q: it will stay in SRAM throughout
# [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
# tl.load(q_ptrs), we get the wrong output!
if EVEN_M & EVEN_N:
if EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0
)
# loop over k, v and update accumulator
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
for start_n in range(0, end_n, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
k = tl.load(k_ptrs + start_n * stride_kn)
else:
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
if IS_CAUSAL:
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
if BIAS_TYPE != "none":
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0
).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n,
mask=(offs_m[:, None] < seqlen_q)
& ((start_n + offs_n)[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
# Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
# can then fuse the mult and add into an fma instruction. But if we have bias we need to
# multiply with softmax_scale here.
qk = qk * softmax_scale + bias
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
p = tl.exp(qk - m_ij[:, None])
else:
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
p = tl.exp(qk * softmax_scale - m_ij[:, None])
l_ij = tl.sum(p, 1)
# scale acc_o
acc_o_scale = tl.exp(m_i - m_ij)
# # -- update output accumulator --
# BUG: have to store and immediately load
tl.store(t_ptrs, acc_o_scale)
acc_o_scale = tl.load(t_ptrs)
acc_o = acc_o * acc_o_scale[:, None]
# update acc_o
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
v = tl.load(v_ptrs + start_n * stride_vn)
else:
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
p = p.to(v.dtype)
acc_o += tl.dot(p, v)
# -- update statistics
m_i = m_ij
l_i_new = tl.exp(lse_i - m_ij) + l_ij
lse_i = m_ij + tl.log(l_i_new)
o_scale = tl.exp(m_i - lse_i)
# BUG: have to store and immediately load
tl.store(t_ptrs, o_scale)
o_scale = tl.load(t_ptrs)
acc_o = acc_o * o_scale[:, None]
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
tl.store(lse_ptrs, lse_i)
# initialize pointers to output
offs_d = tl.arange(0, BLOCK_HEADDIM)
out_ptrs = (
Out
+ off_b * stride_ob
+ off_h * stride_oh
+ (offs_m[:, None] * stride_om + offs_d[None, :])
)
if EVEN_M:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o)
else:
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
else:
tl.store(
out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
)
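# Minimal PyTorch sketch (hypothetical, not part of the original file) of the
# single-pass online-softmax recurrence the kernel above implements: each K/V
# block rescales the running accumulator and the running log-sum-exp, which is
# why only the LSE (and not separate m and l) needs to be written out.
def _online_softmax_reference(q, k, v, softmax_scale, block_n=128):
    # q: (seqlen_q, d); k, v: (seqlen_k, d). Float32 reference, no masking/bias.
    acc = torch.zeros(q.shape[0], v.shape[1], dtype=torch.float32)
    m_i = torch.full((q.shape[0],), float("-inf"))
    lse = torch.full((q.shape[0],), float("-inf"))
    for start in range(0, k.shape[0], block_n):
        kb, vb = k[start : start + block_n], v[start : start + block_n]
        qk = (q.float() @ kb.float().t()) * softmax_scale
        m_ij = torch.maximum(qk.max(dim=1).values, lse)
        p = torch.exp(qk - m_ij[:, None])
        acc = acc * torch.exp(m_i - m_ij)[:, None] + p @ vb.float()
        lse = m_ij + torch.log(torch.exp(lse - m_ij) + p.sum(dim=1))
        m_i = m_ij
    out = acc * torch.exp(m_i - lse)[:, None]  # equals softmax(q k^T * scale) @ v
    return out, lse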
@triton.jit
def _bwd_preprocess_do_o_dot(
Out,
DO,
Delta,
stride_ob,
stride_oh,
stride_om,
stride_dob,
stride_doh,
stride_dom,
nheads,
seqlen_q,
seqlen_q_rounded,
headdim,
BLOCK_M: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# load
o = tl.load(
Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
do = tl.load(
DO
+ off_b * stride_dob
+ off_h * stride_doh
+ offs_m[:, None] * stride_dom
+ offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
@triton.jit
def _bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
):
# [2022-11-01] TD: Same bug. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.store(dv_ptrs), there's a race condition
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
else:
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
else:
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
@triton.jit
def _bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD: tl.constexpr,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
# We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
# initialize row/col offsets
offs_qm = begin_m + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
offs_m = tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
if BIAS_TYPE == "vector":
b_ptrs = Bias + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
# initialize dv and dk
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
# There seems to be some problem with Triton pipelining that makes results wrong for
# headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop
# may have zero step, and pipelining with the bias matrix could screw it up.
# So we just exit early.
if begin_m >= seqlen_q:
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
return
# k and v stay in SRAM throughout
# [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.load(k_ptrs), we get the wrong output!
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
else:
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
else:
k = tl.load(
k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
v = tl.load(
v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
# loop over rows
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
start_m = tl.multiple_of(start_m, BLOCK_M)
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
# Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
if EVEN_M & EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# recompute p = softmax(qk, dim=-1).T
qk = tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
if IS_CAUSAL:
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
if BIAS_TYPE != "none":
tl.debug_barrier() # Race condition otherwise
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(
b_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
qk = qk * softmax_scale + bias
# There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong.
# Also wrong for headdim=64.
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
lse_i = tl.load(LSE + offs_m_curr)
if BIAS_TYPE == "none":
p = tl.exp(qk * softmax_scale - lse_i[:, None])
else:
p = tl.exp(qk - lse_i[:, None])
# compute dv
# [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs
# in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512,
# the output is correct.
if EVEN_M & EVEN_HEADDIM:
do = tl.load(do_ptrs)
else:
# [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask.
do = tl.load(
do_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# if EVEN_M:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs)
# else:
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
# else:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
# else:
# do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q)
# & (offs_d[None, :] < headdim), other=0.0)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
# There seems to be a race condition when headdim=48/96, and dq, dk are wrong.
# Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True
# Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
dp = tl.dot(do, v, trans_b=True)
# There's a race condition for headdim=48
if not EVEN_HEADDIM:
tl.debug_barrier()
# compute ds = p * (dp - delta[:, None])
# Putting the subtraction after the dp matmul (instead of before) is slightly faster
Di = tl.load(D + offs_m_curr)
# Converting ds to q.dtype here reduces register pressure and makes it much faster
# for BLOCK_HEADDIM=128
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
# compute dk = dot(ds.T, q)
dk += tl.dot(ds, q, trans_a=True)
# compute dq
if not (
EVEN_M & EVEN_HEADDIM
): # Otherwise there's a race condition when BIAS_TYPE='matrix'
tl.debug_barrier()
if not ATOMIC_ADD:
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds, k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
else:
if EVEN_HEADDIM:
dq = tl.load(
dq_ptrs,
mask=offs_m_curr[:, None] < seqlen_q,
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=offs_m_curr[:, None] < seqlen_q,
eviction_policy="evict_last",
)
else:
dq = tl.load(
dq_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
eviction_policy="evict_last",
)
else: # If we're parallelizing across the seqlen_k dimension
dq = tl.dot(ds, k)
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
tl.atomic_add(dq_ptrs, dq)
else:
if EVEN_HEADDIM:
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
else:
tl.atomic_add(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
)
# increment pointers
dq_ptrs += BLOCK_M * stride_dqm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_dom
if BIAS_TYPE == "matrix":
b_ptrs += BLOCK_M * stride_bm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
def init_to_zero(name):
return lambda nargs: nargs[name].zero_()
@triton.autotune(
configs=[
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
# Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now
# # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4*
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
],
key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"],
)
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_dob,
stride_doh,
stride_dom,
stride_dqb,
stride_dqh,
stride_dqm,
stride_dkb,
stride_dkh,
stride_dkn,
stride_dvb,
stride_dvh,
stride_dvn,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
SEQUENCE_PARALLEL: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# offset pointers for batch/head
Q += off_b * stride_qb + off_h * stride_qh
K += off_b * stride_kb + off_h * stride_kh
V += off_b * stride_vb + off_h * stride_vh
DO += off_b * stride_dob + off_h * stride_doh
DQ += off_b * stride_dqb + off_h * stride_dqh
DK += off_b * stride_dkb + off_h * stride_dkh
DV += off_b * stride_dvb + off_h * stride_dvh
if BIAS_TYPE != "none":
Bias += off_b * stride_bb + off_h * stride_bh
# pointer to row-wise quantities in value-like data
D += off_hb * seqlen_q_rounded
LSE += off_hb * seqlen_q_rounded
if not SEQUENCE_PARALLEL:
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
for start_n in range(0, num_block_n):
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=False,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
else:
start_n = tl.program_id(0)
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=True,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
# shape constraints
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
assert k.shape == (batch, seqlen_k, nheads, d)
assert v.shape == (batch, seqlen_k, nheads, d)
assert d <= 128, "FlashAttention only support head dimensions up to 128"
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
assert q.is_cuda and k.is_cuda and v.is_cuda
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
if bias.stride(-1) != 1:
bias = bias.contiguous()
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
o = torch.empty_like(q)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
BLOCK = 128
num_warps = 4 if d <= 64 else 8
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_fwd_kernel[grid](
q,
k,
v,
bias,
o,
lse,
tmp,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
o.stride(0),
o.stride(2),
o.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
num_warps=num_warps,
num_stages=1,
)
return o, lse, softmax_scale # softmax_scale could have been updated
def _flash_attn_backward(
do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None
):
# Make sure that the last dimension is contiguous
if do.stride(-1) != 1:
do = do.contiguous()
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
# assert d in {16, 32, 64, 128}
assert d <= 128
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
assert lse.shape == (batch, nheads, seqlen_q_rounded)
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
# dq_accum = torch.zeros_like(q, dtype=torch.float32)
dq_accum = torch.empty_like(q, dtype=torch.float32)
delta = torch.empty_like(lse)
# delta = torch.zeros_like(lse)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_bwd_preprocess_do_o_dot[grid](
o,
do,
delta,
o.stride(0),
o.stride(2),
o.stride(1),
do.stride(0),
do.stride(2),
do.stride(1),
nheads,
seqlen_q,
seqlen_q_rounded,
d,
BLOCK_M=128,
BLOCK_HEADDIM=BLOCK_HEADDIM,
)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
assert bias.stride(-1) == 1
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
# BLOCK_M = 128
# BLOCK_N = 64
# num_warps = 4
grid = lambda META: (
triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
batch * nheads,
)
_bwd_kernel[grid](
q,
k,
v,
bias,
do,
dq_accum,
dk,
dv,
lse,
delta,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
do.stride(0),
do.stride(2),
do.stride(1),
dq_accum.stride(0),
dq_accum.stride(2),
dq_accum.stride(1),
dk.stride(0),
dk.stride(2),
dk.stride(1),
dv.stride(0),
dv.stride(2),
dv.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
# SEQUENCE_PARALLEL=False,
# BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
# num_warps=num_warps,
# num_stages=1,
)
dq.copy_(dq_accum)
class FlashAttnQKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
"""
qkv: (batch, seqlen, 3, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
"""
# Make sure that the last dimension is contiguous
if qkv.stride(-1) != 1:
qkv = qkv.contiguous()
o, lse, ctx.softmax_scale = _flash_attn_forward(
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
bias=bias,
causal=causal,
softmax_scale=softmax_scale,
)
ctx.save_for_backward(qkv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
qkv, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dqkv = torch.empty_like(qkv)
_flash_attn_backward(
do,
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
o,
lse,
dqkv[:, :, 0],
dqkv[:, :, 1],
dqkv[:, :, 2],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dqkv, None, None, None
flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
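# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up). The QKV-packed variant takes a single
# (batch, seqlen, 3, nheads, headdim) tensor, the layout produced by a fused QKV
# projection; since the op is exposed through autograd.Function.apply, bias and
# causal are passed positionally.
def _example_flash_attn_qkvpacked_func():
    qkv = torch.randn(
        2, 1024, 3, 16, 64, device="cuda", dtype=torch.float16, requires_grad=True
    )
    out = flash_attn_qkvpacked_func(qkv, None, True)  # bias=None, causal=True
    out.sum().backward()  # the gradient lands in qkv.grad with the same packed layout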
class FlashAttnKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
"""
q: (batch, seqlen_q, nheads, headdim)
kv: (batch, seqlen_k, 2, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, kv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, kv, o, lse, bias = ctx.saved_tensors
if len(ctx.needs_input_grad) >= 3:
assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dkv = torch.empty_like(kv)
_flash_attn_backward(
do,
q,
kv[:, :, 0],
kv[:, :, 1],
o,
lse,
dq,
dkv[:, :, 0],
dkv[:, :, 1],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dkv, None, None, None
flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
class FlashAttnFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
"""
q: (batch_size, seqlen_q, nheads, headdim)
k, v: (batch_size, seqlen_k, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, k, v, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
_flash_attn_backward(
do,
q,
k,
v,
o,
lse,
dq,
dk,
dv,
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dk, dv, None, None, None
flash_attn_func = FlashAttnFunc.apply
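# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up) for the unpacked interface, following the docstring above.
def _example_flash_attn_func():
    q, k, v = [
        torch.randn(2, 1024, 16, 64, device="cuda", dtype=torch.float16, requires_grad=True)
        for _ in range(3)
    ]
    out = flash_attn_func(q, k, v, None, True)  # bias=None, causal=True
    out.sum().backward()  # dq, dk, dv come from the Triton backward kernel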
# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
# for benchmarking.
# We fixed a few dtype casts to make it work for bf16
"""
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
"""
import pytest
import torch
import triton
import triton.language as tl
@triton.jit
def _fwd_kernel(
Q,
K,
V,
sm_scale,
TMP,
L,
M, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
Out,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
stride_oz,
stride_oh,
stride_om,
stride_on,
Z,
H,
N_CTX,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hz = tl.program_id(1)
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_DMODEL)
off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk
# Initialize pointers to Q, K, V
q_ptrs = Q + off_q
k_ptrs = K + off_k
v_ptrs = V + off_v
# initialize pointer to m and l
t_ptrs = TMP + off_hz * N_CTX + offs_m
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# load q: it will stay in SRAM throughout
q = tl.load(q_ptrs)
# loop over k, v and update accumulator
for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
k = tl.load(k_ptrs + start_n * stride_kn)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
qk *= sm_scale
qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
# -- compute m_ij, p, l_ij
m_ij = tl.max(qk, 1)
p = tl.exp(qk - m_ij[:, None])
l_ij = tl.sum(p, 1)
# -- update m_i and l_i
m_i_new = tl.maximum(m_i, m_ij)
alpha = tl.exp(m_i - m_i_new)
beta = tl.exp(m_ij - m_i_new)
l_i_new = alpha * l_i + beta * l_ij
# -- update output accumulator --
# scale p
p_scale = beta / l_i_new
p = p * p_scale[:, None]
# scale acc
acc_scale = l_i / l_i_new * alpha
tl.store(t_ptrs, acc_scale)
acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load
acc = acc * acc_scale[:, None]
# update acc
v = tl.load(v_ptrs + start_n * stride_vk)
p = p.to(v.dtype)
acc += tl.dot(p, v)
# update m_i and l_i
l_i = l_i_new
m_i = m_i_new
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
l_ptrs = L + off_hz * N_CTX + offs_m
m_ptrs = M + off_hz * N_CTX + offs_m
tl.store(l_ptrs, l_i)
tl.store(m_ptrs, m_i)
# initialize pointers to output
offs_n = tl.arange(0, BLOCK_DMODEL)
off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
out_ptrs = Out + off_o
tl.store(out_ptrs, acc)
@triton.jit
def _bwd_preprocess(
Out,
DO,
L,
NewDO,
Delta,
BLOCK_M: tl.constexpr,
D_HEAD: tl.constexpr,
):
off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
off_n = tl.arange(0, D_HEAD)
# load
o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
denom = tl.load(L + off_m).to(tl.float32)
# compute
do = do / denom[:, None]
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
tl.store(Delta + off_m, delta)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
sm_scale,
Out,
DO,
DQ,
DK,
DV,
L,
M,
D,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
Z,
H,
N_CTX,
num_block,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hz = tl.program_id(0)
off_z = off_hz // H
off_h = off_hz % H
# offset pointers for batch/head
Q += off_z * stride_qz + off_h * stride_qh
K += off_z * stride_qz + off_h * stride_qh
V += off_z * stride_qz + off_h * stride_qh
DO += off_z * stride_qz + off_h * stride_qh
DQ += off_z * stride_qz + off_h * stride_qh
DK += off_z * stride_qz + off_h * stride_qh
DV += off_z * stride_qz + off_h * stride_qh
for start_n in range(0, num_block):
lo = start_n * BLOCK_M
# initialize row/col offsets
offs_qm = lo + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
offs_m = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, BLOCK_DMODEL)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
# pointer to row-wise quantities in value-like data
D_ptrs = D + off_hz * N_CTX
m_ptrs = M + off_hz * N_CTX
# initialize dv and dk
dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# k and v stay in SRAM throughout
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
# loop over rows
for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
q = tl.load(q_ptrs)
# recompute p = softmax(qk, dim=-1).T
# NOTE: `do` is pre-divided by `l`; no normalization here
qk = tl.dot(q, k, trans_b=True)
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
m = tl.load(m_ptrs + offs_m_curr)
p = tl.exp(qk * sm_scale - m[:, None])
# compute dv
do = tl.load(do_ptrs)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
Di = tl.load(D_ptrs + offs_m_curr)
dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
dp += tl.dot(do, v, trans_b=True)
# compute ds = p * (dp - delta[:, None])
ds = p * dp * sm_scale
# compute dk = dot(ds.T, q)
dk += tl.dot(ds.to(q.dtype), q, trans_a=True)
# # compute dq
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds.to(k.dtype), k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
# # increment pointers
dq_ptrs += BLOCK_M * stride_qm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_qm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
class _attention(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, sm_scale):
BLOCK = 128
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
assert Lk in {16, 32, 64, 128}
o = torch.empty_like(q)
grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
tmp = torch.empty(
(q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32
)
L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
num_warps = 4 if Lk <= 64 else 8
_fwd_kernel[grid](
q,
k,
v,
sm_scale,
tmp,
L,
m,
o,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
o.stride(0),
o.stride(1),
o.stride(2),
o.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
BLOCK_DMODEL=Lk,
num_warps=num_warps,
num_stages=1,
)
ctx.save_for_backward(q, k, v, o, L, m)
ctx.BLOCK = BLOCK
ctx.grid = grid
ctx.sm_scale = sm_scale
ctx.BLOCK_DMODEL = Lk
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, l, m = ctx.saved_tensors
do = do.contiguous()
dq = torch.zeros_like(q, dtype=torch.float32)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
do_scaled = torch.empty_like(do)
delta = torch.empty_like(l)
_bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)](
o,
do,
l,
do_scaled,
delta,
BLOCK_M=ctx.BLOCK,
D_HEAD=ctx.BLOCK_DMODEL,
)
# NOTE: kernel currently buggy for other values of `num_warps`
num_warps = 8
_bwd_kernel[(ctx.grid[1],)](
q,
k,
v,
ctx.sm_scale,
o,
do_scaled,
dq,
dk,
dv,
l,
m,
delta,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
ctx.grid[0],
BLOCK_M=ctx.BLOCK,
BLOCK_N=ctx.BLOCK,
BLOCK_DMODEL=ctx.BLOCK_DMODEL,
num_warps=num_warps,
num_stages=1,
)
return dq.to(q.dtype), dk, dv, None
attention = _attention.apply
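# Hypothetical usage sketch (not part of the original file; the helper name and
# shapes are made up) for the benchmark kernel above: the layout is
# (batch, nheads, seqlen, headdim), the kernel is causal-only, and seqlen should
# be a multiple of BLOCK = 128 since no bounds masking is applied.
def _example_tutorial_attention():
    q = torch.randn(2, 16, 1024, 64, device="cuda", dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)
    out = attention(q, k, v, 0.125)  # sm_scale = 1 / sqrt(headdim)
    out.sum().backward()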
import math
import hydra
import torch
import torch.nn as nn
from einops import rearrange
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from flash_attn.flash_blocksparse_attn_interface import (
convert_blockmask,
flash_blocksparse_attn_func,
)
class FlashBlocksparseAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_temp: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.1)
"""
def __init__(
self,
sparsity_config,
softmax_temp=None,
attention_dropout=0.0,
max_seq_length=2048,
device=None,
dtype=None,
):
super().__init__()
self.sparsity_config = hydra.utils.instantiate(sparsity_config)
self.softmax_temp = softmax_temp
self.dropout_p = attention_dropout
# initialize sparse layout and register as buffer
max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256
layout = self.sparsity_config.make_layout(max_seq_length)
self.register_buffer("layout", layout)
blockmask_converted = convert_blockmask(self.layout, causal=False)
self.register_buffer("blockmask_converted", blockmask_converted)
# logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}')
def forward(
self,
qkv,
attn_mask=None,
key_padding_mask=None,
causal=False,
cu_seqlens=None,
max_s=None,
need_weights=False,
convert_mask=True,
):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
attn_mask: An implementation of BaseMask that encodes where each
query can attend to
key_padding_mask: An implementation of BaseMask that encodes how
many queries each sequence in the batch consists of
"""
assert not need_weights
assert attn_mask is None
assert qkv.dtype == torch.float16
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if key_padding_mask is None:
qkv = rearrange(qkv, "b s ... -> (b s) ...")
max_s = seqlen
cu_seqlens = torch.arange(
0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device
)
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
else:
key_padding_mask_bool = key_padding_mask.bool_matrix
nheads = qkv.shape[-2]
x = rearrange(qkv, "b s three h d -> b s (three h d)")
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool)
x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
output_unpad = flash_blocksparse_attn_func(
x_unpad,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(
pad_input(
rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
),
"b s (h d) -> b s h d",
h=nheads,
)
else:
assert max_s is not None
seqlen = max_s
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if convert_mask:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
else:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
self.blockmask_converted,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
convert_mask=False,
)
return output, None
class FlashBlocksparseMHA(nn.Module):
def __init__(
self,
embed_dim,
num_heads,
sparsity_config,
bias=True,
batch_first=True,
attention_dropout=0.0,
causal=False,
max_seq_length=2048,
device=None,
dtype=None,
**kwargs,
) -> None:
assert batch_first
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.embed_dim = embed_dim
self.causal = causal
self.num_heads = num_heads
assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
self.head_dim = self.embed_dim // num_heads
assert self.head_dim in [16, 32, 64], "Only supports head_dim == 16, 32, or 64"
self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
self.inner_attn = FlashBlocksparseAttention(
sparsity_config,
attention_dropout=attention_dropout,
max_seq_length=max_seq_length,
**factory_kwargs,
)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
def forward(
self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False
):
qkv = self.Wqkv(x)
qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads)
context, attn_weights = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal
)
return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py
import flash_attn_cuda
import torch
import torch.nn as nn
def convert_blockmask(blockmask, causal):
"""Convert from the 0-1 format to the format used by the CUDA code.
0 means the block is skipped.
nonzero means the block is not skipped.
Argument:
blockmask: (row, col): a 0-1 tensor
Return:
blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
indices of the nonzero blocks, padded with -1 to reach length @row.
The indices are multiplied by 4, with the smallest bit used to encode whether
it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is
the last nonzero in its row.
"""
assert not causal
# TD [2022-05-13]: The indexing and sorting is very tricky
nrow, ncol = blockmask.shape
# Sort does not support bool on CUDA
blockmask = blockmask.to(dtype=torch.uint8)
nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
]
first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
]
nonzero_idx = nonzero_sorted_rowidx * 4
nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
nonzero_idx[nonzero_val == 0] = -1
return nonzero_idx.T.contiguous().to(dtype=torch.int32)
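# Hypothetical illustration (not part of the original file; the helper name is
# made up) of the formats described in the docstring above: a (nrow, ncol) 0/1
# block layout goes in, and a (ncol, nrow) int32 tensor of encoded row indices,
# padded with -1, comes out.
def _example_convert_blockmask():
    blockmask = torch.tensor([[1, 0, 1], [0, 1, 1]])
    converted = convert_blockmask(blockmask, causal=False)
    assert converted.shape == (3, 2) and converted.dtype == torch.int32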
def _flash_blocksparse_attn_forward(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
):
context, softmax_lse, *rest = flash_attn_cuda.fwd_block(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
)
# if context.isnan().any() or softmax_lse.isnan().any():
# breakpoint()
S_dmask = rest[0] if return_softmax else None
return context, softmax_lse, S_dmask
def _flash_blocksparse_attn_backward(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal,
):
dqkv, dp, softmax_d = flash_attn_cuda.bwd_block(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
softmax_scale,
max_s,
causal,
None,
)
# if dqkv.isnan().any() or softmax_d.isnan().any():
# breakpoint()
return dqkv
class FlashBlocksparseAttnFun(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
# Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=False,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context
@staticmethod
def backward(ctx, dout):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
# S_dmask is None, temporarily use another tensor just to get it running
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
context,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None, None
# We duplicate code to return both the output and the softmax for testing
# Returning both makes backward a bit slower, so we want to keep using the other version for speed.
class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=True,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context, S_dmask, softmax_lse
@staticmethod
def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None
def flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale=None,
causal=False,
return_attn_probs=False,
convert_mask=True,
):
"""dropout_p should be set to 0.0 during evaluation"""
func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS
if convert_mask:
blockmask = convert_blockmask(blockmask, causal=causal)
return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py
# for benchmarking.
# We added support for seqlen=2k and seqlen=4k
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex._autocast_utils import _cast_if_autocast_enabled
from apex.transformer.enums import AttnMaskType
from fused_softmax_lib import (
scaled_masked_softmax_backward,
scaled_masked_softmax_forward,
scaled_masked_softmax_get_batch_per_block,
scaled_upper_triang_masked_softmax_backward,
scaled_upper_triang_masked_softmax_forward,
)
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in GPT models).
    3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
def scaled_upper_triang_masked_softmax(inputs, _, scale):
b, np, sq, sk = inputs.size()
assert sq == sk, "causal mask is only for self attention"
# Reshaping input to 3D tensor (attn_batches, sq, sk)
inputs = inputs.view(-1, sq, sk)
args = _cast_if_autocast_enabled(inputs, scale)
with torch.cuda.amp.autocast(enabled=False):
probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
return probs.view(b, np, sq, sk)
# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
# So I needed to manually write two `torch.autograd.Function` inheritances.
# Fused operation which performs the following three operations in sequence:
# 1. Scale the tensor.
# 2. Apply the mask.
# 3. Perform softmax.
class ScaledMaskedSoftmax(torch.autograd.Function):
@staticmethod
def forward(ctx, inputs, mask, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
def scaled_masked_softmax(inputs, mask, scale):
# input is 4D tensor (b, np, sq, sk)
args = _cast_if_autocast_enabled(inputs, mask, scale)
with torch.cuda.amp.autocast(enabled=False):
return ScaledMaskedSoftmax.apply(*args)
class FusedScaleMaskSoftmax(torch.nn.Module):
"""
fused operation: scaling + mask + softmax
Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate if the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super().__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
if self.input_in_fp16 and self.input_in_bf16:
raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
if not (self.scale is None or softmax_in_fp32):
raise RuntimeError("softmax should be in fp32 when scaled")
if self.scaled_masked_softmax_fusion:
if self.attn_mask_type == AttnMaskType.causal:
self.fused_softmax_func = scaled_upper_triang_masked_softmax
elif self.attn_mask_type == AttnMaskType.padding:
self.fused_softmax_func = scaled_masked_softmax
else:
raise ValueError("Invalid attn_mask_type.")
def forward(self, input, mask):
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
and (
self.attn_mask_type == AttnMaskType.causal
or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
)
            and 16 < sk <= 8192  # sk must be in (16, 8192]
            and sq % 4 == 0  # sq must be divisible by 4
            and sk % 4 == 0  # sk must be divisible by 4
            and attn_batches % 4 == 0  # np * b must be divisible by 4
):
if 0 <= sk <= 8192:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
# input.shape = [b, np, sq, sk]
scale = self.scale if self.scale is not None else 1.0
return self.fused_softmax_func(input, mask, scale)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
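# Hypothetical usage sketch (not part of the original file): building the fused softmax module
# for padding-mask attention. The lambda mask_func mirrors Megatron-style masking, where masked
# positions are pushed to a large negative value; the argument values here are illustrative
# assumptions, not requirements of the module.
def _example_fused_scale_mask_softmax(attention_scores, attention_mask):
    softmax_module = FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        attn_mask_type=AttnMaskType.padding,
        scaled_masked_softmax_fusion=True,
        mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
        softmax_in_fp32=True,
        scale=None,
    )
    # attention_scores: (b, np, sq, sk) fp16 tensor; attention_mask: broadcastable bool tensor
    return softmax_module(attention_scores, attention_mask)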
# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py
# But we use nn.Linear instead of Conv2d and it's about 8x faster.
from functools import partial
import torch.nn as nn
from einops import rearrange
from torch import _assert
from torch.nn.modules.utils import _pair
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
class PatchEmbed(nn.Module):
"""2D Image to Patch Embedding"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
bias=True,
fused_bias_fc=False,
):
super().__init__()
img_size = _pair(img_size)
patch_size = _pair(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense
self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
_, _, H, W = x.shape
_assert(
H == self.img_size[0],
f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
)
_assert(
W == self.img_size[1],
f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
)
x = self.proj(
rearrange(
x,
"b c (h p1) (w p2) -> b h w (c p1 p2)",
p1=self.patch_size[0],
p2=self.patch_size[1],
)
)
if self.flatten:
x = rearrange(x, "b h w c -> b (h w) c")
x = self.norm(x)
return x
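# Illustrative sketch (not part of the original file): embedding a batch of 224x224 RGB images
# into 14 x 14 = 196 patch tokens of width 768 with the linear patch projection above.
def _example_patch_embed():
    import torch  # this file only imports torch.nn, so pull in torch for the demo tensor
    patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    images = torch.randn(2, 3, 224, 224)
    tokens = patch_embed(images)  # (2, 196, 768) since flatten=True by default
    return tokens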
# Copyright (c) 2023, Tri Dao.
import math
from typing import Optional, Tuple, Union
import torch
from einops import rearrange, repeat
from flash_attn.ops.triton.rotary import apply_rotary
def rotate_half(x, interleaved=False):
if not interleaved:
x1, x2 = x.chunk(2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
else:
x1, x2 = x[..., ::2], x[..., 1::2]
return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
"""
x: (batch_size, seqlen, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
"""
ro_dim = cos.shape[-1] * 2
assert ro_dim <= x.shape[-1]
cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
return torch.cat(
[x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]],
dim=-1,
)
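# Illustrative sketch (not part of the original file): exercising the pure-PyTorch reference
# above on CPU, rotating only the first 32 dimensions of a 64-dim head (rotary_dim = 32 is an
# arbitrary choice for the example).
def _example_apply_rotary_emb_torch():
    batch, seqlen, nheads, headdim, rotary_dim = 2, 16, 4, 64, 32
    x = torch.randn(batch, seqlen, nheads, headdim)
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
    freqs = torch.outer(torch.arange(seqlen, dtype=torch.float32), inv_freq)  # (seqlen, rotary_dim / 2)
    out = apply_rotary_emb_torch(x, torch.cos(freqs), torch.sin(freqs))
    return out  # same shape as x; the last headdim - rotary_dim dims are passed through unchanged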
class ApplyRotaryEmb(torch.autograd.Function):
@staticmethod
def forward(
ctx,
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
out = apply_rotary(
x,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
interleaved=interleaved,
inplace=inplace,
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin, cu_seqlens) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
ctx.inplace = inplace
ctx.max_seqlen = max_seqlen
return out if not inplace else x
@staticmethod
def backward(ctx, do):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cu_seqlens = ctx.saved_tensors
# TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
# "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
if not ctx.interleaved and not ctx.inplace:
do = do.clone()
dx = apply_rotary(
do,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=ctx.max_seqlen,
interleaved=ctx.interleaved,
inplace=ctx.inplace,
conjugate=True,
)
return dx, None, None, None, None, None, None, None
def apply_rotary_emb(
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
"""
Arguments:
x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
cos, sin: (seqlen_rotary, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
inplace: if True, apply rotary embedding in-place.
seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
cu_seqlens: (batch + 1,) or None
max_seqlen: int
Return:
out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
"""
return ApplyRotaryEmb.apply(
x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
)
# For backward compatibility
apply_rotary_emb_func = apply_rotary_emb
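# Hypothetical usage sketch (not part of the original file): applying the Triton rotary kernel
# to a (batch, seqlen, nheads, headdim) tensor on GPU. The shapes and the choice of rotating 32
# of the 64 head dimensions are illustrative; cos/sin are built as in apply_rotary_emb_torch above.
def _example_apply_rotary_emb():
    batch, seqlen, nheads, headdim, rotary_dim = 2, 128, 8, 64, 32
    x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, rotary_dim, 2, device="cuda", dtype=torch.float32) / rotary_dim)
    )
    freqs = torch.outer(torch.arange(seqlen, device="cuda", dtype=torch.float32), inv_freq)
    cos, sin = torch.cos(freqs).to(x.dtype), torch.sin(freqs).to(x.dtype)
    return apply_rotary_emb(x, cos, sin, interleaved=False, inplace=False)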
class ApplyRotaryEmbQKV_(torch.autograd.Function):
@staticmethod
def forward(
ctx,
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
batch, seqlen, three, nheads, headdim = qkv.shape
assert three == 3
if cos_k is None and sin_k is None and qkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need qkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
# qk = rearrange(qkv[:, :, :2], "b s t h d -> b s (t h) d")
qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
apply_rotary(
qk, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
q, k = qkv[:, :, 0], qkv[:, :, 1]
apply_rotary(q, cos, sin, seqlen_offsets, interleaved=interleaved, inplace=True)
apply_rotary(k, cos_k, sin_k, seqlen_offsets, interleaved=interleaved, inplace=True)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin, cos_k, sin_k)
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cos_k, sin_k, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return qkv
@staticmethod
def backward(ctx, dqkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cos_k, sin_k, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cos_k, sin_k = ctx.saved_tensors
if cos_k is None and sin_k is None and dqkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
dqk = rearrange(dqkv[:, :, :2], "b s t h d -> b s (t h) d")
apply_rotary(
dqk,
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
dq, dk = dqkv[:, :, 0], dqkv[:, :, 1]
apply_rotary(
dq, cos, sin, seqlen_offsets, interleaved=ctx.interleaved, inplace=True, conjugate=True
)
apply_rotary(
dk,
cos_k,
sin_k,
seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dqkv, None, None, None, None, None, None
def apply_rotary_emb_qkv_(
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
qkv: (batch_size, seqlen, 3, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
cos_k, sin_k: (seqlen, rotary_dim / 2), optional
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
qkv: (batch_size, seqlen, 3, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
"""
return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
class ApplyRotaryEmbKV_(torch.autograd.Function):
@staticmethod
def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0):
batch, seqlen, two, nheads, headdim = kv.shape
assert two == 2
k = kv[:, :, 0]
apply_rotary(
k, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return kv
@staticmethod
def backward(ctx, dkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, seqlen_offsets = ctx.saved_tensors
else:
cos, sin = ctx.saved_tensors
apply_rotary(
dkv[:, :, 0],
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dkv, None, None, None, None
def apply_rotary_emb_kv_(
kv,
cos,
sin,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
kv: (batch_size, seqlen, 2, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
kv: (batch_size, seqlen, 2, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of K.
"""
return ApplyRotaryEmbKV_.apply(kv, cos, sin, interleaved, seqlen_offsets)
class RotaryEmbedding(torch.nn.Module):
"""
    The rotary position embeddings from RoFormer_ (Su et al.).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.
    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_; GPT-NeoX was an inspiration.
.. _RoFormer: https://arxiv.org/abs/2104.09864
.. _repo: https://github.com/ZhuiyiTechnology/roformer
.. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
"""
def __init__(
self,
dim: int,
base=10000.0,
interleaved=False,
scale_base=None,
pos_idx_in_fp32=True,
device=None,
):
"""
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
otherwise they might be in lower precision.
        This option was added because previously (before 2023-07-02) we constructed
        the position indices using the dtype of self.inv_freq. In most cases this would
        be fp32, but if the model is trained in pure bf16 (not mixed precision), then
        self.inv_freq would be bf16, and the position indices would also be in bf16.
Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
embeddings for some positions will coincide.
To maintain compatibility with models previously trained in pure bf16,
we add this option.
"""
super().__init__()
self.dim = dim
self.base = float(base)
self.pos_idx_in_fp32 = pos_idx_in_fp32
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = self._compute_inv_freq(device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.interleaved = interleaved
self.scale_base = scale_base
scale = (
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
if scale_base is not None
else None
)
self.register_buffer("scale", scale, persistent=False)
self._seq_len_cached = 0
self._cos_cached = None
self._sin_cached = None
self._cos_k_cached = None
self._sin_k_cached = None
def _compute_inv_freq(self, device=None):
return 1.0 / (
self.base
** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
)
def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
# Reset the tables if the sequence length has changed,
# if we're on a new device (possibly due to tracing for instance),
# or if we're switching from inference mode to training
if (
seqlen > self._seq_len_cached
or self._cos_cached is None
or self._cos_cached.device != device
or self._cos_cached.dtype != dtype
or (self.training and self._cos_cached.is_inference())
):
self._seq_len_cached = seqlen
# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
# And the output of arange can be quite large, so bf16 would lose a lot of precision.
# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
if self.pos_idx_in_fp32:
t = torch.arange(seqlen, device=device, dtype=torch.float32)
# We want fp32 here as well since inv_freq will be multiplied with t, and the output
# will be large. Having it in bf16 will lose a lot of precision and cause the
# cos & sin output to change significantly.
# We want to recompute self.inv_freq if it was not loaded in fp32
if self.inv_freq.dtype != torch.float32:
inv_freq = self._compute_inv_freq(device=device)
else:
inv_freq = self.inv_freq
else:
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
inv_freq = self.inv_freq
# Don't do einsum, it converts fp32 to fp16 under AMP
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
freqs = torch.outer(t, inv_freq)
if self.scale is None:
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
else:
power = (
torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
- seqlen // 2
) / self.scale_base
scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
# We want the multiplication by scale to happen in fp32
self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
def forward(
self,
qkv: torch.Tensor,
kv: Optional[torch.Tensor] = None,
seqlen_offset: Union[int, torch.Tensor] = 0,
max_seqlen: Optional[int] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""
qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
else it's just q of shape (batch, seqlen, nheads, headdim)
kv: (batch, seqlen, 2, nheads, headdim)
seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
should pass in max_seqlen, which will update the cos / sin cache up to that length.
Apply rotary embedding *inplace* to qkv and / or kv.
"""
seqlen = qkv.shape[1]
if max_seqlen is not None:
self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
elif isinstance(seqlen_offset, int):
self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
if kv is None:
if self.scale is None:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
q = qkv
q = apply_rotary_emb_func(
q,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
inplace=True,
seqlen_offsets=seqlen_offset,
)
if self.scale is None:
kv = apply_rotary_emb_kv_(
kv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
kv = apply_rotary_emb_kv_(
kv,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
return q, kv
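# Hypothetical usage sketch (not part of the original file): applying RotaryEmbedding to a
# packed qkv tensor. The Triton kernel runs on GPU, so this assumes a CUDA device; the shapes
# (batch 2, seqlen 128, 8 heads of dim 64) are illustrative.
def _example_rotary_embedding():
    headdim = 64
    rotary = RotaryEmbedding(dim=headdim, interleaved=False, device="cuda")
    qkv = torch.randn(2, 128, 3, 8, headdim, device="cuda", dtype=torch.float16)
    qkv = rotary(qkv)  # rotary applied in-place to Q and K; V is left untouched
    return qkv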
# Copyright (c) 2023, Tri Dao.
import torch
import torch.nn as nn
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
class CrossEntropyLoss(nn.Module):
def __init__(
self,
ignore_index=-100,
reduction="mean",
label_smoothing=0.0,
logit_scale=1.0,
lse_square_scale=0.0,
inplace_backward=False,
process_group=None,
return_z_loss=False,
):
"""
Arguments:
            ignore_index: int. If labels == ignore_index, the loss is set to 0.0.
label_smoothing: float
lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
This is also referred to as "z-loss".
inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
This saves memory.
process_group: if not None, we're doing Tensor Parallel: each process is responsible for
one part of the vocab. The loss will be aggregated across processes.
return_z_loss: bool. If True, we return the component of the loss contributed by
the lse_square_scale value. This value is only for logging and does not support
backprop.
"""
super().__init__()
if reduction not in ["mean", "none", "sum"]:
raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
self.ignore_index = ignore_index
self.reduction = reduction
self.label_smoothing = label_smoothing
self.logit_scale = logit_scale
self.lse_square_scale = lse_square_scale
self.inplace_backward = inplace_backward
self.process_group = process_group
self.return_z_loss = return_z_loss
def forward(self, input, target):
"""
Arguments:
input: (batch, vocab_size)
target: (batch,)
Returns:
losses: (batch,) if reduction is 'none', else (1,), dtype float
z_loss: (batch,) if reduction is 'none', else (1,), dtype float (if self.return_z_loss)
"""
assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
loss, z_loss = cross_entropy_loss(
input,
target,
label_smoothing=self.label_smoothing,
logit_scale=self.logit_scale,
lse_square_scale=self.lse_square_scale,
ignored_index=self.ignore_index,
inplace_backward=self.inplace_backward,
process_group=self.process_group,
)
if self.reduction == "mean":
loss = loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
loss = loss.sum()
else:
loss = loss
if not self.return_z_loss:
return loss
if self.reduction == "mean":
z_loss = z_loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
z_loss = z_loss.sum()
else:
z_loss = z_loss
return loss, z_loss
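# Hypothetical usage sketch (not part of the original file): the Triton kernel only supports
# CUDA tensors, so both logits and labels must live on the GPU. The vocab size and batch size
# below are arbitrary examples.
def _example_cross_entropy_loss():
    loss_fn = CrossEntropyLoss(inplace_backward=True)
    logits = torch.randn(8, 32000, device="cuda", dtype=torch.float16, requires_grad=True)
    labels = torch.randint(0, 32000, (8,), device="cuda")
    loss = loss_fn(logits, labels)
    loss.backward()
    return loss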
# Copyright (c) 2023, GGGGGGXY, Tri Dao.
import math
import json
import re
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, AutoConfig, PretrainedConfig
def remap_state_dict_hf_baichuan(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^model.", "transformer.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(
r"^transformer.embed_tokens.",
"transformer.embeddings.word_embeddings.",
key,
)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict[
"transformer.embeddings.word_embeddings.weight"
]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings
# differently.
vocab_size = (
math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.",
r"transformer.layers.\1.norm1.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
for l in range(config.n_layer):
w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight")
w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight")
# Our ordering is different
state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat(
[w3, w1], dim=0
)
def key_mapping_mlp(key):
return re.sub(
r"^transformer.layers.(\d+).mlp.down_proj.",
r"transformer.layers.\1.mlp.fc2.",
key,
)
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attn.W_pack.",
r"transformer.layers.\1.mixer.Wqkv.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attn.o_proj.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
for l in range(config.n_layer):
# pop rotary_emb.inv_freq from state dict
state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None)
return state_dict
def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config:
    # HACK: the config doesn't say whether it's rotary or alibi,
    # so we have to infer from the hidden size (7B -> rotary, 13B -> alibi).
    # HACK: the config doesn't say whether it uses norm head,
    # so we have to infer from the vocab size
    # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head).
use_rotary = baichuan_config.hidden_size < 5000
return GPT2Config(
vocab_size=baichuan_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=baichuan_config.hidden_size,
n_layer=baichuan_config.num_hidden_layers,
n_head=baichuan_config.num_attention_heads,
n_inner=baichuan_config.intermediate_size,
activation_function="swiglu", # Hardcode since HF calls it 'silu'
# baichuan doesn't have dropout, idk if it's because they only release the inference code
resid_pdrop=0.0,
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=baichuan_config.rms_norm_eps,
initializer_range=baichuan_config.initializer_range,
bos_token_id=baichuan_config.bos_token_id,
eos_token_id=baichuan_config.eos_token_id,
# These are new arguments not in the original GPT2Config
pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything
rms_norm=True,
rotary_emb_fraction=1.0 if use_rotary else 0.0,
rotary_emb_interleaved=False,
use_alibi=not use_rotary,
use_flash_attn=not use_rotary, # Alibi code path requires flash_attn
tie_word_embeddings=False,
norm_head=baichuan_config.vocab_size > 70000,
qkv_proj_bias=False,
out_proj_bias=False,
mlp_fc1_bias=False,
mlp_fc2_bias=False,
)
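# Hypothetical usage sketch (not part of the original file): converting a Baichuan Hugging Face
# config into the GPT2Config consumed here. The model name is illustrative and requires
# trust_remote_code since Baichuan ships custom config code.
def _example_baichuan_config_conversion():
    baichuan_config = AutoConfig.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
    return baichuan_config_to_gpt2_config(baichuan_config)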
# Copyright (c) 2022, Tri Dao.
# This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
# https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
# https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
import logging
import re
from collections import OrderedDict
from collections.abc import Sequence
from functools import partial
from typing import Any, Mapping
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import BertConfig, PretrainedConfig
from transformers.models.bert.modeling_bert import (
BaseModelOutputWithPoolingAndCrossAttentions,
BertForPreTrainingOutput,
)
from flash_attn.bert_padding import (
index_first_axis,
index_first_axis_residual,
pad_input,
unpad_input,
)
from flash_attn.modules.block import Block
from flash_attn.modules.embedding import BertEmbeddings
from flash_attn.modules.mha import MHA
from flash_attn.modules.mlp import FusedMLP, Mlp
from flash_attn.utils.pretrained import state_dict_from_pretrained
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
try:
from flash_attn.ops.triton.layer_norm import layer_norm_fn
except ImportError:
layer_norm_fn = None
try:
from flash_attn.losses.cross_entropy import CrossEntropyLoss
except ImportError:
CrossEntropyLoss = None
logger = logging.getLogger(__name__)
def create_mixer_cls(config, cross_attn=False, return_residual=False):
use_flash_attn = getattr(config, "use_flash_attn", False)
fused_bias_fc = getattr(config, "fused_bias_fc", False)
rotary_kwargs = {}
if config.position_embedding_type == "rotary":
rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size)
rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None)
rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False)
mixer_cls = partial(
MHA,
num_heads=config.num_attention_heads,
cross_attn=cross_attn,
dropout=config.attention_probs_dropout_prob,
causal=False,
fused_bias_fc=fused_bias_fc,
use_flash_attn=use_flash_attn,
return_residual=return_residual,
**rotary_kwargs,
)
return mixer_cls
def create_mlp_cls(config, layer_idx=None, return_residual=False):
inner_dim = config.intermediate_size
fused_mlp = getattr(config, "fused_mlp", False)
if fused_mlp:
        assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
            "fused_mlp only supports approximate gelu"
        )
if not fused_mlp:
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
mlp_cls = partial(
Mlp,
hidden_features=inner_dim,
activation=partial(F.gelu, approximate=approximate),
return_residual=return_residual,
)
else:
if FusedMLP is None:
raise ImportError("fused_dense is not installed")
mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
# mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
if isinstance(mlp_checkpoint_lvl, Sequence):
assert layer_idx is not None
mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
mlp_cls = partial(
FusedMLP,
hidden_features=inner_dim,
checkpoint_lvl=mlp_checkpoint_lvl,
return_residual=return_residual,
)
return mlp_cls
def create_block(config, layer_idx=None):
last_layer_subset = getattr(config, "last_layer_subset", False)
cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1
# TD [2022-12-19]: For cross attention (last layer), we actually want to return the
# residual x_kv, not residual x. But it's annoying to change the API (and it only affects
# one layer) so we just choose not to return residual in this case.
return_residual = not cross_attn
mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual)
mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual)
norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps)
block = Block(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
prenorm=False,
resid_dropout1=config.hidden_dropout_prob,
resid_dropout2=config.hidden_dropout_prob,
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
return_residual=return_residual,
)
return block
# https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
def _init_weights(module, initializer_range=0.02):
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=initializer_range)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=initializer_range)
if module.padding_idx is not None:
nn.init.zeros_(module.weight[module.padding_idx])
class BertEncoder(nn.Module):
def __init__(self, config: BertConfig):
super().__init__()
self.use_flash_attn = getattr(config, "use_flash_attn", False)
self.layers = nn.ModuleList(
[create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
)
def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
"""If subset_mask is not None, we only want output for the subset of the sequence.
This means that we only compute the last layer output for these tokens.
subset_mask: (batch, seqlen), dtype=torch.bool
"""
if key_padding_mask is None or not self.use_flash_attn:
mixer_kwargs = (
{"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
)
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if subset_mask is not None:
hidden_states = hidden_states[subset_mask]
else:
batch, seqlen = hidden_states.shape[:2]
hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
hidden_states, key_padding_mask
)
mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
if subset_mask is None:
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
hidden_states = pad_input(hidden_states, indices, batch, seqlen)
else:
for layer in self.layers[:-1]:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if key_padding_mask is not None:
subset_idx = torch.nonzero(
subset_mask[key_padding_mask], as_tuple=False
).flatten()
subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
else:
subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten()
subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
hidden_states_subset, hidden_states = index_first_axis_residual(
hidden_states, subset_idx
)
# It's ok to set max_seqlen_q to be much larger
mixer_kwargs = {
"x_kv": hidden_states,
"cu_seqlens": subset_cu_seqlens,
"max_seqlen": max_seqlen_in_batch,
"cu_seqlens_k": cu_seqlens,
"max_seqlen_k": max_seqlen_in_batch,
}
hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
return hidden_states
class BertPooler(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states, pool=True):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0] if pool else hidden_states
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
self.transform_act_fn = nn.GELU(approximate=approximate)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
if not self.fused_dropout_add_ln:
hidden_states = self.layer_norm(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps
)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True)
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BertPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
"""An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
@classmethod
def from_pretrained(cls, model_name, config, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
            model_name: either:
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
# Instantiate model.
model = cls(config, *inputs, **kwargs)
load_return = model.load_state_dict(
remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False
)
logger.info(load_return)
return model
class BertModel(BertPreTrainedModel):
def __init__(self, config: BertConfig, add_pooling_layer=True):
super().__init__(config)
self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if config.vocab_size % self.pad_vocab_size_multiple != 0:
config.vocab_size += self.pad_vocab_size_multiple - (
config.vocab_size % self.pad_vocab_size_multiple
)
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
self.embeddings = BertEmbeddings(
config.hidden_size,
config.vocab_size,
config.max_position_embeddings,
config.type_vocab_size,
padding_idx=config.pad_token_id,
)
self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) if add_pooling_layer else None
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
masked_tokens_mask=None,
):
"""If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
we only want the output for the masked tokens. This means that we only compute the last
layer output for these tokens.
masked_tokens_mask: (batch, seqlen), dtype=torch.bool
"""
hidden_states = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids
)
        # TD [2022-12-18]: Don't need to force residual in fp32
# BERT puts embedding LayerNorm before embedding dropout.
if not self.fused_dropout_add_ln:
hidden_states = self.emb_ln(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps
)
hidden_states = self.emb_drop(hidden_states)
if masked_tokens_mask is not None:
batch_size, seqlen = input_ids.shape[:2]
# We also need the first column for the CLS token
first_col_mask = torch.zeros(
batch_size, seqlen, dtype=torch.bool, device=input_ids.device
)
first_col_mask[:, 0] = True
subset_mask = masked_tokens_mask | first_col_mask
else:
subset_mask = None
sequence_output = self.encoder(
hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask
)
if masked_tokens_mask is None:
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
else:
# TD [2022-03-01]: the indexing here is very tricky.
if attention_mask is not None:
subset_idx = subset_mask[attention_mask]
pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]]
sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]]
else:
pool_input = sequence_output[first_col_mask[subset_mask]]
sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
)
class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config: BertConfig):
super().__init__(config)
# If dense_seq_output, we only need to pass the hidden states for the masked out tokens
# (around 15%) to the classifier heads.
self.dense_seq_output = getattr(config, "dense_seq_output", False)
        # If last_layer_subset, we only need to compute the last layer for a subset of tokens
        # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
self.last_layer_subset = getattr(config, "last_layer_subset", False)
if self.last_layer_subset:
assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
use_xentropy = getattr(config, "use_xentropy", False)
if use_xentropy and CrossEntropyLoss is None:
raise ImportError("xentropy_cuda is not installed")
loss_cls = (
nn.CrossEntropyLoss
if not use_xentropy
else partial(CrossEntropyLoss, inplace_backward=True)
)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config)
self.mlm_loss = loss_cls(ignore_index=0)
self.nsp_loss = loss_cls(ignore_index=-1)
# Initialize weights and apply final processing
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
self.tie_weights()
def tie_weights(self):
self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
labels=None,
next_sentence_label=None,
):
"""
If labels are provided, they must be 0 for masked out tokens (as specified in the attention
mask).
Outputs:
if `labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
"""
masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
outputs = self.bert(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
attention_mask=attention_mask.bool() if attention_mask is not None else None,
masked_tokens_mask=masked_tokens_mask,
)
sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
if self.dense_seq_output and labels is not None:
masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
if not self.last_layer_subset:
sequence_output = index_first_axis(
rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
)
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
total_loss = None
if labels is not None and next_sentence_label is not None:
if (
self.dense_seq_output and labels is not None
): # prediction_scores are already flattened
masked_lm_loss = self.mlm_loss(
prediction_scores, labels.flatten()[masked_token_idx]
)
else:
masked_lm_loss = self.mlm_loss(
rearrange(prediction_scores, "... v -> (...) v"),
rearrange(labels, "... -> (...)"),
)
next_sentence_loss = self.nsp_loss(
rearrange(seq_relationship_score, "... t -> (...) t"),
rearrange(next_sentence_label, "... -> (...)"),
)
total_loss = masked_lm_loss.float() + next_sentence_loss.float()
return BertForPreTrainingOutput(
loss=total_loss,
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
)
def remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
"""
# LayerNorm
def key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
return key
state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
# Layers
def key_mapping_layers(key):
return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
key = re.sub(
r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm2.\2",
key,
)
key = re.sub(
r"^cls.predictions.transform.LayerNorm.(weight|bias)",
r"cls.predictions.transform.layer_norm.\1",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
last_layer_subset = getattr(config, "last_layer_subset", False)
for d in range(config.num_hidden_layers):
Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
if not (last_layer_subset and d == config.num_hidden_layers - 1):
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
[Wq, Wk, Wv], dim=0
)
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
else:
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
def key_mapping_attn(key):
return re.sub(
r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mixer.out_proj.\2",
key,
)
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
def key_mapping_decoder_bias(key):
return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
)
decoder_weight = state_dict["cls.predictions.decoder.weight"]
state_dict["cls.predictions.decoder.weight"] = F.pad(
decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
)
# If the vocab was padded, we want to set the decoder bias for those padded indices to be
# strongly negative (i.e. the decoder shouldn't predict those indices).
# TD [2022-05-09]: I don't think it affects the MLPerf training.
decoder_bias = state_dict["cls.predictions.decoder.bias"]
state_dict["cls.predictions.decoder.bias"] = F.pad(
decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
)
return state_dict
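# Hypothetical usage sketch (not part of the original file): remapping a Hugging Face
# bert-base-uncased checkpoint so it loads into the flash_attn BertForPreTraining defined above.
# The model name is illustrative; strict=False tolerates keys present on only one side.
def _example_remap_hf_bert():
    config = BertConfig.from_pretrained("bert-base-uncased")
    state_dict = remap_state_dict(state_dict_from_pretrained("bert-base-uncased"), config)
    model = BertForPreTraining(config)
    model.load_state_dict(state_dict, strict=False)
    return model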
def inv_remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BERT compatible.
This function is meant to be the inverse of remap_state_dict.
"""
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
decoder_weight = state_dict["cls.predictions.decoder.weight"]
decoder_bias = state_dict["cls.predictions.decoder.bias"]
# unpad embeddings
state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[
: config.orig_vocab_size, :
]
state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :]
state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size]
for d in range(config.num_hidden_layers):
last_layer_subset = getattr(config, "last_layer_subset", False)
if not last_layer_subset or d != (config.num_hidden_layers - 1):
Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[
: Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[
Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[
2 * Wqkv_weights.shape[0] // 3 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[
: Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[
Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[
2 * Wqkv_biases.shape[0] // 3 :
]
else:
Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[
: Wkv_weights.shape[0] // 2, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[
Wkv_weights.shape[0] // 2 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
: Wkv_biases.shape[0] // 2
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[
Wkv_biases.shape[0] // 2 :
]
def inv_key_mapping_ln(key):
key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
key = re.sub(
r"bert.encoder.layers.(\d+).norm1.(weight|bias)",
r"bert.encoder.layers.\1.attention.output.LayerNorm.\2",
key,
)
key = re.sub(
r"bert.encoder.layers.(\d+).norm2.(weight|bias)",
r"bert.encoder.layers.\1.output.LayerNorm.\2",
key,
)
key = re.sub(
r"cls.predictions.transform.layer_norm.(weight|bias)",
r"cls.predictions.transform.LayerNorm.\1",
key,
)
return key
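    # The original BERT checkpoints name the LayerNorm parameters "gamma"/"beta" rather than
    # "weight"/"bias"; the mapping below emits that older naming.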
def inv_key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key)
key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key)
return key
def inv_key_mapping_layers(key):
return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key)
def inv_key_mapping_mlp(key):
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)",
r"bert.encoder.layer.\1.intermediate.dense.\2",
key,
)
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)",
r"bert.encoder.layer.\1.output.dense.\2",
key,
)
return key
def inv_key_mapping_attn(key):
return re.sub(
r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)",
r"bert.encoder.layer.\1.attention.output.dense.\2",
key,
)
def inv_key_mapping_decoder_bias(key):
return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key)
state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_layers(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_attn(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items()
)
return state_dict
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig
def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
"""
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
r"transformer.layers.\1.norm\2.\3",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.weight",
r"transformer.layers.\1.mlp.fc1.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.weight",
r"transformer.layers.\1.mlp.fc2.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.bias",
r"transformer.layers.\1.mlp.fc1.bias",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.bias",
r"transformer.layers.\1.mlp.fc2.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# TODO: add support for multi-head attention
assert config.multi_query, "Only multi-query attention is supported"
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
        # with multi-query attention, c_attn.weight has shape (embed_dim + 2 * head_dim, embed_dim)
# see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112
# see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183
# ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim)
q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
# duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
k = torch.tile(k, (config.n_head, 1))
v = torch.tile(v, (config.n_head, 1))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)
# same deal with the bias
c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
        # ((n_head + 2) * head_dim,) -> (3 * n_head * head_dim,)
q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis: (head_dim,) -> (n_head * head_dim,)
k = torch.tile(k, (config.n_head,))
v = torch.tile(v, (config.n_head,))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)
def key_mapping_attn(key):
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.weight",
r"transformer.layers.\1.mixer.out_proj.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.bias",
r"transformer.layers.\1.mixer.out_proj.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
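# Illustrative shapes for the multi-query expansion above (numbers are hypothetical, not tied to
# any particular checkpoint): with n_embd = 2048, n_head = 16, head_dim = 128, c_attn.weight has
# shape (2048 + 128 + 128, 2048); after tiling the single K and V heads 16x each, Wqkv.weight
# has shape (3 * 2048, 2048).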
def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.
This function is meant to be the inverse of remap_state_dict_hf_bigcode.
"""
# Word embedding and position embeddings
def inv_key_mapping_pos_emb(key):
return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)
state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    word_embeddings = word_embeddings[: config.vocab_size, :]  # unpad the vocab dimension
state_dict["transformer.wte.weight"] = word_embeddings
state_dict["lm_head.weight"] = word_embeddings
# LayerNorm
def inv_key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.layers.(\d+).norm(1|2).(weight|bias)",
r"transformer.h.\1.ln_\2.\3",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())
# MLPs
def inv_key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.weight",
r"transformer.h.\1.mlp.c_fc.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.weight",
r"transformer.h.\1.mlp.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.bias",
r"transformer.h.\1.mlp.c_fc.bias",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.bias",
r"transformer.h.\1.mlp.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
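        # Inverse of the multi-query expansion: Wqkv packs [Q (embed_dim rows); K (n_head * head_dim
        # rows); V (n_head * head_dim rows)], where the K and V blocks are n_head identical copies of
        # a single head. Keeping only the first head_dim rows of each (k[:head_dim], v[:head_dim])
        # recovers the HF c_attn layout, assuming the state_dict came from remap_state_dict_hf_bigcode
        # (or otherwise has identical K/V copies).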
Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
q, k, v = torch.split(
Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight
# Same deal with the bias
Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
q, k, v = torch.split(
Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias
def inv_key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.weight",
r"transformer.h.\1.attn.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.bias",
r"transformer.h.\1.attn.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
return GPT2Config(
activation_function=bigcode_config.activation_function,
attn_pdrop=bigcode_config.attn_pdrop,
bos_token_id=bigcode_config.bos_token_id,
embd_pdrop=bigcode_config.embd_pdrop,
eos_token_id=bigcode_config.eos_token_id,
initializer_range=bigcode_config.initializer_range,
layer_norm_epsilon=bigcode_config.layer_norm_epsilon,
max_batch_size=bigcode_config.max_batch_size,
max_sequence_length=bigcode_config.max_sequence_length,
model_type=bigcode_config.model_type,
multi_query=bigcode_config.multi_query,
n_embd=bigcode_config.n_embd,
n_head=bigcode_config.n_head,
n_inner=bigcode_config.n_inner,
n_layer=bigcode_config.n_layer,
n_positions=bigcode_config.n_positions,
resid_pdrop=bigcode_config.resid_pdrop,
scale_attn_weights=bigcode_config.scale_attn_weights,
summary_activation=bigcode_config.summary_activation,
summary_first_dropout=bigcode_config.summary_first_dropout,
summary_proj_to_labels=bigcode_config.summary_proj_to_labels,
summary_type=bigcode_config.summary_type,
summary_use_proj=bigcode_config.summary_use_proj,
use_cache=bigcode_config.use_cache,
vocab_size=bigcode_config.vocab_size,
)
# Copyright (c) 2023, Tri Dao.
import math
import json
import re
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, AutoConfig, PretrainedConfig
def remap_state_dict_hf_btlm(state_dict, config):
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
if "transformer.wpe.weight" in state_dict:
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
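    # BTLM uses a gated (SwiGLU-style) MLP: c_fc and c_fc2 are the two input projections, stacked
    # row-wise into a single fc1 weight/bias below. The .t() calls are needed because the HF
    # checkpoint stores these weights transposed, as (in_features, out_features).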
for d in range(config.num_hidden_layers):
W1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.weight")
W3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.weight")
state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = torch.cat([W1.t(), W3.t()], dim=0)
b1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.bias")
b3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.bias")
state_dict[f"transformer.layers.{d}.mlp.fc1.bias"] = torch.cat([b1, b3], dim=0)
W2 = state_dict.pop(f"transformer.h.{d}.mlp.c_proj.weight")
state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
def key_mapping_mlp(key):
key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
Wqkv = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
Wout = state_dict.pop(f"transformer.h.{d}.attn.c_proj.weight")
state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
state_dict.pop(f"transformer.relative_pe.slopes") # We don't store the Alibi slopes
def key_mapping_attn(key):
key = re.sub(r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config:
return GPT2Config(
vocab_size=btlm_config.vocab_size,
n_positions=0 if btlm_config.position_embedding_type == "alibi" else btlm_config.n_positions,
n_embd=btlm_config.hidden_size,
n_layer=btlm_config.num_hidden_layers,
n_head=btlm_config.num_attention_heads,
n_inner=btlm_config.n_inner,
activation_function=btlm_config.activation_function,
resid_pdrop=btlm_config.resid_pdrop,
embd_pdrop=btlm_config.embd_pdrop,
attn_pdrop=btlm_config.attn_pdrop,
layer_norm_epsilon=btlm_config.layer_norm_epsilon,
initializer_range=btlm_config.initializer_range,
bos_token_id=btlm_config.bos_token_id,
eos_token_id=btlm_config.eos_token_id,
# These are new arguments not in the original GPT2Config
use_alibi=btlm_config.position_embedding_type == "alibi",
use_flash_attn=btlm_config.position_embedding_type == "alibi", # Alibi code path requires flash_attn
mup_width_scale=btlm_config.mup_width_scale,
mup_embeddings_multiplier=btlm_config.mup_embeddings_scale,
mup_output_multiplier=btlm_config.mup_output_alpha,
mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d,
mlp_multiple_of=1,
)
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import FalconConfig, GPT2Config
def remap_state_dict_hf_falcon(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^transformer.h.", "transformer.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(
r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key
)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
output_embeddings_bias = state_dict.pop("lm_head.bias")
state_dict["lm_head.bias"] = F.pad(
output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key)
key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attention.query_key_value.",
r"transformer.layers.\1.mixer.Wqkv.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.dense.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", 1)
headdim = config.hidden_size // n_head
for l in range(config.n_layer):
# The weights are stored in a different layout compared to our implementation
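        # HF Falcon groups the fused QKV rows by KV group: each group holds n_head // n_head_kv
        # query heads followed by one key head and one value head. The rearrange below exposes that
        # grouping so the rows can be regrouped into flash_attn's [all Q; all K; all V] ordering.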
Wqkv = rearrange(
state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"),
"(group ratio headdim) ... -> group ratio headdim ...",
ratio=n_head // n_head_kv + 2,
headdim=headdim,
)
Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...")
Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...")
Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
return state_dict
def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config:
# The 40b config uses "n_head_kv" instead of "num_kv_heads"
n_head_kv = getattr(
falcon_config,
"n_head_kv",
1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head,
)
# HACK: the 40b config has 2 LN per layer instead of 1, but that's not reflected in the config.
# So we have to infer it from the number of heads in the key/value block
parallel_block_tied_norm = n_head_kv == 1
return GPT2Config(
vocab_size=falcon_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=falcon_config.hidden_size,
n_layer=falcon_config.n_layer,
n_head=falcon_config.n_head,
n_inner=falcon_config.hidden_size * 4,
activation_function="gelu",
resid_pdrop=falcon_config.hidden_dropout,
embd_pdrop=0.0, # There doesn't seem to be any embedding dropout
attn_pdrop=falcon_config.attention_dropout,
layer_norm_epsilon=falcon_config.layer_norm_epsilon,
initializer_range=falcon_config.initializer_range,
bos_token_id=falcon_config.bos_token_id,
eos_token_id=falcon_config.eos_token_id,
# These are new arguments not in the original GPT2Config
parallel_block=falcon_config.parallel_attn,
n_head_kv=n_head_kv,
parallel_block_tied_norm=parallel_block_tied_norm,
rotary_emb_fraction=1.0,
rotary_emb_interleaved=False,
tie_word_embeddings=True,
qkv_proj_bias=falcon_config.bias,
out_proj_bias=falcon_config.bias,
mlp_fc1_bias=falcon_config.bias,
mlp_fc2_bias=falcon_config.bias,
lm_head_bias=False,
)
# Copyright (c) 2024, Tri Dao.
import logging
import math
import re
from collections import OrderedDict, namedtuple
from collections.abc import Sequence
from functools import partial
from typing import Dict, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config
from flash_attn.models.bigcode import remap_state_dict_hf_bigcode
from flash_attn.models.falcon import remap_state_dict_hf_falcon
from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox
from flash_attn.models.gptj import remap_state_dict_hf_gptj
from flash_attn.models.llama import remap_state_dict_hf_llama
from flash_attn.models.opt import remap_state_dict_hf_opt
from flash_attn.modules.block import Block, ParallelBlock
from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings
from flash_attn.modules.mha import MHA, ParallelMHA
from flash_attn.modules.mlp import (
FusedMLP,
GatedMlp,
Mlp,
ParallelFusedMLP,
ParallelGatedMlp,
ParallelMLP,
)
from flash_attn.ops.activations import sqrelu_fwd
from flash_attn.utils.distributed import (
all_gather,
all_gather_raw,
get_dim_for_local_rank,
sync_shared_params,
)
from flash_attn.utils.generation import GenerationMixin
from flash_attn.utils.pretrained import state_dict_from_pretrained
try:
from flash_attn.ops.fused_dense import ColumnParallelLinear
except ImportError:
ColumnParallelLinear = None
try:
from flash_attn.ops.triton.mlp import FusedDenseSqreluDense
except ImportError:
FusedDenseSqreluDense = None
try:
from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
except ImportError:
layer_norm_fn, RMSNorm = None, None
logger = logging.getLogger(__name__)
def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
attn_scale_power = 0.5 if not getattr(config, "mup_scale_qk_dot_by_d", False) else 1.0
softmax_scale = 1.0 if not config.scale_attn_weights else (head_dim ** (-attn_scale_power))
softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0)
if config.scale_attn_by_inverse_layer_idx:
assert layer_idx is not None
softmax_scale /= float(layer_idx + 1)
dwconv = getattr(config, "attn_dwconv", False)
if dwconv:
assert process_group is None, "TensorParallel MHA does not support dwconv yet"
qkv_proj_bias = getattr(config, "qkv_proj_bias", True)
out_proj_bias = getattr(config, "out_proj_bias", True)
rotary_emb_dim = int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim)
rotary_emb_base = getattr(config, "rotary_emb_base", 10000.0)
rotary_emb_scale_base = getattr(config, "rotary_emb_scale_base", None)
rotary_emb_interleaved = getattr(config, "rotary_emb_interleaved", False)
use_alibi = getattr(config, "use_alibi", False)
window_size = getattr(config, "window_size", (-1, -1))
use_flash_attn = getattr(config, "use_flash_attn", False)
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if not fused_bias_fc:
assert process_group is None, "TensorParallel MHA requires fused_bias_fc"
mha_cls = MHA if process_group is None else ParallelMHA
serial_kwargs = (
{"fused_bias_fc": fused_bias_fc, "dwconv": dwconv} if process_group is None else {}
)
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
num_heads_kv = getattr(config, "n_head_kv", None)
mixer_cls = partial(
mha_cls,
num_heads=config.num_attention_heads,
num_heads_kv=num_heads_kv,
qkv_proj_bias=qkv_proj_bias,
out_proj_bias=out_proj_bias,
dropout=config.attn_pdrop,
softmax_scale=softmax_scale,
causal=True,
layer_idx=layer_idx,
rotary_emb_dim=rotary_emb_dim,
rotary_emb_base=rotary_emb_base,
rotary_emb_scale_base=rotary_emb_scale_base,
rotary_emb_interleaved=rotary_emb_interleaved,
use_alibi=use_alibi,
window_size=window_size,
use_flash_attn=use_flash_attn,
**serial_kwargs,
**parallel_kwargs,
**factory_kwargs,
)
return mixer_cls
def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True)
mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True)
fused_mlp = getattr(config, "fused_mlp", False)
if fused_mlp:
assert config.activation_function in [
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
]
fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False)
if fused_dense_sqrelu_dense:
assert config.activation_function == "sqrelu", (
"fused_dense_sqrelu_dense only " "supports approximate activation_function sqrelu"
)
assert not (fused_dense_sqrelu_dense and fused_mlp)
if not fused_mlp and not fused_dense_sqrelu_dense:
assert config.activation_function in [
"gelu",
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
"glu",
"swiglu",
"geglu",
]
if config.activation_function in ["glu", "swiglu", "geglu"]:
activation = (
F.sigmoid
if config.activation_function == "glu"
else (F.silu if config.activation_function == "swiglu" else F.gelu)
)
mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_multiple_of = getattr(config, "mlp_multiple_of", 128)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
multiple_of=mlp_multiple_of,
**parallel_kwargs,
**factory_kwargs,
)
else:
if config.activation_function == "relu":
activation = partial(F.relu, inplace=True)
elif config.activation_function == "sqrelu":
activation = sqrelu_fwd
else:
approximate = (
"tanh"
if config.activation_function
in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
else "none"
)
activation = partial(F.gelu, approximate=approximate)
mlp_cls = Mlp if process_group is None else ParallelMLP
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
**parallel_kwargs,
**factory_kwargs,
)
else:
mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
# mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
if isinstance(mlp_checkpoint_lvl, Sequence):
assert layer_idx is not None
mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
if fused_mlp:
if FusedMLP is None:
raise ImportError("fused_dense is not installed")
activation = (
"gelu_approx"
if config.activation_function
in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
else config.activation_function
)
mlp_cls = FusedMLP if process_group is None else ParallelFusedMLP
parallel_kwargs = (
{
"process_group": process_group,
"sequence_parallel": getattr(config, "sequence_parallel", True),
}
if process_group is not None
else {}
)
mlp_cls = partial(
mlp_cls,
hidden_features=config.n_inner,
activation=activation,
checkpoint_lvl=mlp_checkpoint_lvl,
bias1=mlp_fc1_bias,
bias2=mlp_fc2_bias,
**parallel_kwargs,
**factory_kwargs,
)
elif fused_dense_sqrelu_dense:
            assert (
                process_group is None
            ), "Tensor Parallel is not implemented for FusedDenseSqreluDense"
assert FusedDenseSqreluDense is not None
mlp_cls = partial(
FusedDenseSqreluDense,
hidden_features=config.n_inner,
checkpoint_lvl=mlp_checkpoint_lvl,
**factory_kwargs,
)
else:
raise RuntimeError("MLP type not supported")
return mlp_cls
def create_block(config, layer_idx=None, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
sequence_parallel = getattr(config, "sequence_parallel", True)
mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
use_rms_norm = getattr(config, "rms_norm", False)
norm_cls = partial(
nn.LayerNorm if not use_rms_norm else RMSNorm,
eps=config.layer_norm_epsilon,
**factory_kwargs,
)
# TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
residual_in_fp32 = getattr(config, "residual_in_fp32", False)
resid_dropout1 = config.resid_pdrop if layer_idx is None or layer_idx > 0 else config.embd_pdrop
prenorm = getattr(config, "prenorm", True)
parallel_block = getattr(config, "parallel_block", False)
if not parallel_block:
block = Block(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
prenorm=prenorm,
resid_dropout1=resid_dropout1,
resid_dropout2=config.resid_pdrop,
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
residual_in_fp32=residual_in_fp32,
sequence_parallel=sequence_parallel and process_group is not None,
mark_shared_params=process_group is not None,
)
else:
assert prenorm
block = ParallelBlock(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
resid_dropout1=resid_dropout1,
resid_dropout2=config.resid_pdrop,
tied_norm=getattr(config, "parallel_block_tied_norm", False),
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
residual_in_fp32=residual_in_fp32,
sequence_parallel=sequence_parallel and process_group is not None,
mark_shared_params=process_group is not None,
)
block.layer_idx = layer_idx
return block
class GPTPreTrainedModel(nn.Module):
"""An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__()
if not isinstance(config, GPT2Config):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
@classmethod
def from_pretrained(
cls,
model_name,
config,
*args,
strict=True,
device=None,
dtype=None,
world_size=1,
rank=0,
**kwargs,
):
"""
Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
"""
# Instantiate model.
model = cls(config, *args, device=device, dtype=dtype, **kwargs)
# Load state_dict in cpu because we already initialized the model in GPU, and we don't
# want extra stuff taking up more GPU memory
state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype)
if model_name.startswith("gpt2"):
state_dict = remap_state_dict_hf_gpt2(state_dict, config)
elif model_name.startswith("facebook/opt"):
state_dict = remap_state_dict_hf_opt(state_dict, config)
elif model_name.startswith("EleutherAI/gpt-j-") or model_name.startswith(
"togethercomputer/GPT-JT-"
):
state_dict = remap_state_dict_hf_gptj(state_dict, config)
elif (
model_name.startswith("EleutherAI/gpt-neox-")
or model_name.startswith("EleutherAI/pythia-")
or model_name.startswith("togethercomputer/RedPajama-INCITE-")
):
state_dict = remap_state_dict_hf_gpt_neox(state_dict, config)
elif model_name.startswith("tiiuae/falcon-"):
state_dict = remap_state_dict_hf_falcon(state_dict, config)
elif model_name.startswith("meta-llama/Llama-"):
state_dict = remap_state_dict_hf_llama(state_dict, config)
elif model_name.startswith("bigcode/") or model_name.startswith("WizardLM/"):
state_dict = remap_state_dict_hf_bigcode(state_dict, config)
else:
raise NotImplementedError(f"Model {model_name} not supported")
if world_size > 1:
state_dict = shard_state_dict_tp(state_dict, config, world_size, rank)
load_return = model.load_state_dict(state_dict, strict=strict)
logger.info(load_return)
return model
# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
def _init_weights(
module, n_layer, initializer_range=0.02, mup_width_scale=1.0, rescale_prenorm_residual=True
):
mup_init_scale = math.sqrt(mup_width_scale)
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=initializer_range * mup_init_scale)
optim_cfg = getattr(module.weight, "_optim", {})
optim_cfg.update({"lr_multiplier": mup_width_scale})
setattr(module.weight, "_optim", optim_cfg)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=initializer_range)
if rescale_prenorm_residual:
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
#
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
for name, p in module.named_parameters():
if name in ["out_proj.weight", "fc2.weight"]:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
nn.init.normal_(
p, mean=0.0, std=initializer_range * mup_init_scale / math.sqrt(2 * n_layer)
)
class GPTModel(GPTPreTrainedModel):
def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
super().__init__(config)
factory_kwargs = {"device": device, "dtype": dtype}
self.process_group = process_group
self.sequence_parallel = getattr(config, "sequence_parallel", True)
assert config.activation_function in [
"gelu",
"gelu_new",
"gelu_fast",
"gelu_approx",
"gelu_pytorch_tanh",
"relu",
"sqrelu",
"glu",
"swiglu",
"geglu",
]
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
self.embeddings_multiplier = getattr(config, "mup_embeddings_multiplier", 1.0)
# TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
self.residual_in_fp32 = getattr(config, "residual_in_fp32", False)
# These 2 options are for OPT-350m
self.prenorm = getattr(config, "prenorm", True)
use_rms_norm = getattr(config, "rms_norm", False)
word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
# For GPT-J, GPT-NeoX
self.parallel_block = getattr(config, "parallel_block", False)
if process_group is None:
self.embeddings = GPT2Embeddings(
config.hidden_size,
vocab_size,
config.max_position_embeddings,
word_embed_proj_dim=word_embed_proj_dim,
**factory_kwargs,
)
else:
self.embeddings = ParallelGPT2Embeddings(
config.hidden_size,
vocab_size,
config.max_position_embeddings,
process_group=process_group,
sequence_parallel=self.sequence_parallel,
**factory_kwargs,
)
# We change the order of dropout, residual and layer norm:
# Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
# Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
# the main branch (output of MLP). The model definition is unchanged, but the mapping of the
        # nn.Dropout probabilities is changed.
# This is for performance reason: we can fuse dropout + add + layer_norm.
self.layers = nn.ModuleList(
[
create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
for i in range(config.num_hidden_layers)
]
)
rotary_emb_fraction = getattr(config, "rotary_emb_fraction", 0.0)
if rotary_emb_fraction > 0.0: # Tie all the RotaryEmbedding modules to share the same cos/sin cache
for layer in self.layers[1:]:
layer.mixer.rotary_emb = self.layers[0].mixer.rotary_emb
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln:
if layer_norm_fn is None:
raise ImportError("Triton is not installed")
if self.prenorm:
self.drop_f = nn.Dropout(config.resid_pdrop)
norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm
self.ln_f = norm_cls(
config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs
)
if process_group is not None:
for p in self.ln_f.parameters():
# Mark the norm parameters as "shared_params" so that we sync their values at init.
p._shared_params = True
# Mark the norm params as "sequence_parallel" so we run all-reduce on their grads.
if self.sequence_parallel:
p._sequence_parallel = True
self.apply(
partial(
_init_weights,
n_layer=config.num_hidden_layers,
initializer_range=config.initializer_range,
mup_width_scale=getattr(config, "mup_width_scale", 1.0),
)
)
self.tie_weights()
def tie_weights(self):
if self.process_group is not None:
sync_shared_params(self, self.process_group)
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
return {
i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
for i, layer in enumerate(self.layers)
}
def forward(self, input_ids, position_ids=None, inference_params=None):
# If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen
# dimensions so that we can split on it easily, in case of small batch size.
# Only the attention layers need to know the seqlen.
embedding_kwargs = (
{"combine_batch_seqlen_dim": True}
if self.process_group is not None and self.sequence_parallel
else {}
)
hidden_states = self.embeddings(input_ids, position_ids=position_ids, **embedding_kwargs)
if self.embeddings_multiplier != 1.0:
hidden_states = hidden_states * self.embeddings_multiplier
if self.parallel_block:
hidden_states2 = None
residual = None
mixer_kwargs = (
{"seqlen": input_ids.shape[1]}
if self.process_group is not None and self.sequence_parallel
else {}
)
if inference_params is not None:
mixer_kwargs["inference_params"] = inference_params
for layer in self.layers:
if self.prenorm:
if not self.parallel_block:
hidden_states, residual = layer(
hidden_states, residual, mixer_kwargs=mixer_kwargs
)
else:
hidden_states, hidden_states2, residual = layer(
hidden_states, hidden_states2, residual, mixer_kwargs=mixer_kwargs
)
else:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if self.prenorm:
if not self.fused_dropout_add_ln:
dropped = self.drop_f(hidden_states)
if not self.parallel_block:
residual = (dropped + residual) if residual is not None else dropped
else:
dropped2 = self.drop_f(hidden_states2)
residual = (
(residual + dropped + dropped2)
if residual is not None
else dropped + dropped2
)
hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype))
else:
# Set prenorm=False here since we don't need the residual
hidden_states = layer_norm_fn(
hidden_states,
self.ln_f.weight,
self.ln_f.bias,
residual=residual,
x1=None if not self.parallel_block else hidden_states2,
eps=self.ln_f.eps,
dropout_p=self.drop_f.p if self.training else 0.0,
prenorm=False,
is_rms_norm=isinstance(self.ln_f, RMSNorm)
)
return hidden_states
class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin):
def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__(config)
self.process_group = process_group
self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs)
self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
lm_head_bias = getattr(config, "lm_head_bias", False)
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
# This option is for OPT-350m
word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
embed_dim = config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim
if word_embed_proj_dim is not None:
self.project_out = nn.Linear(config.n_embd, embed_dim, bias=False, **factory_kwargs)
else:
self.project_out = None
mup_width_scale = getattr(config, "mup_width_scale", 1.0)
mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0)
self.output_scale = mup_output_multiplier * mup_width_scale
if process_group is None:
self.lm_head = nn.Linear(embed_dim, vocab_size, bias=lm_head_bias, **factory_kwargs)
else:
if ColumnParallelLinear is None:
raise ImportError("fused_dense_lib is not installed")
self.lm_head = ColumnParallelLinear(
embed_dim,
vocab_size,
process_group,
bias=lm_head_bias,
sequence_parallel=getattr(config, "sequence_parallel", True),
**factory_kwargs,
)
self.norm_head = getattr(config, "norm_head", False)
# Initialize weights and apply final processing
self.apply(
partial(
_init_weights,
n_layer=config.num_hidden_layers,
initializer_range=config.initializer_range,
mup_width_scale=mup_width_scale,
)
)
self.tie_weights()
def tie_weights(self):
if self.tie_word_embeddings:
self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight
if self.process_group is not None:
sync_shared_params(self, self.process_group)
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
return self.transformer.allocate_inference_cache(
batch_size, max_seqlen, dtype=dtype, **kwargs
)
def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
"""
input_ids: (batch, seqlen) int tensor
inference_params: for generation. Adapted from Megatron-LM (and Apex)
https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
num_last_tokens: if > 0, only return the logits for the last n tokens
"""
assert (
input_ids.ndim == 2
), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
b, slen = input_ids.shape
hidden_states = self.transformer(
input_ids, position_ids=position_ids, inference_params=inference_params
)
if inference_params is not None:
assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
if num_last_tokens > 0:
hidden_states = hidden_states[:, -num_last_tokens:]
if self.project_out is not None:
hidden_states = self.project_out(hidden_states)
if self.output_scale != 1.0:
hidden_states = hidden_states * self.output_scale
if not self.norm_head:
lm_logits = self.lm_head(hidden_states)
else:
lm_head_weight = F.normalize(self.lm_head.weight)
if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
hidden_states = all_gather(hidden_states, self.lm_head.process_group)
lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
# During inference, we want the full logit for sampling
if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
return CausalLMOutput(logits=lm_logits)
def load_state_dict(self, state_dict, strict=True):
# Remapping from our checkpoints that used a different ordering of layers in the block
# Previous: Attn / MLP -> Dropout -> Add -> LN
# Current: Dropout -> Add -> LN -> Attn / MLP
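        # Concretely, each norm shifts one slot earlier: old ln_0 -> layers.0.norm1,
        # old layers.{l}.norm1 -> layers.{l}.norm2, old layers.{l}.norm2 -> layers.{l+1}.norm1,
        # and the last layer's old norm2 -> ln_f.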
if "transformer.ln_0.weight" in state_dict:
n_layers = len(self.transformer.layers)
ln_weight = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.weight")
ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias")
state_dict["transformer.ln_f.weight"] = ln_weight
state_dict["transformer.ln_f.bias"] = ln_bias
for l in reversed(range(n_layers)):
ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight")
ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias")
state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight
state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias
if l > 0:
ln_weight = state_dict.pop(f"transformer.layers.{l - 1}.norm2.weight")
ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias")
state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight
state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias
ln_weight = state_dict.pop("transformer.ln_0.weight")
ln_bias = state_dict.pop("transformer.ln_0.bias")
state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight
state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias
return super().load_state_dict(state_dict, strict=strict)
def shard_state_dict_tp(state_dict, config, world_size, rank):
"""Convert the state_dict of a standard GPT model to the state_dict of a GPT model
with tensor parallel.
This function modifies state_dict in place.
"""
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
assert vocab_size % world_size == 0
assert config.hidden_size % world_size == 0
inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
assert inner_dim % world_size == 0
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", n_head)
embed_dim = config.hidden_size
head_dim = embed_dim // n_head
def shard_first_dim(state_dict, key):
if key in state_dict:
x = state_dict[key]
dim = x.shape[0] // world_size
state_dict[key] = x[rank * dim : (rank + 1) * dim]
def shard_last_dim(state_dict, key, multiple_of=1):
if key in state_dict:
x = state_dict[key]
dim_each_rank = [
get_dim_for_local_rank(x.size(-1), world_size, local_rank, multiple_of)
for local_rank in range(world_size)
]
beg, end = tuple(sum(dim_each_rank[:pos]) for pos in (rank, rank + 1))
state_dict[key] = x[..., beg:end]
def shard_gatedmlp_fc1_dim(state_dict, key):
if key in state_dict:
x = state_dict[key]
dim = x.shape[0] // world_size // 2
state_dict[key] = rearrange(
rearrange(x, "(two o) ... -> two o ...", two=2)[:, rank * dim : (rank + 1) * dim],
"two o ... -> (two o) ...",
)
def shard_qkv_headdim(state_dict, key):
if key in state_dict:
n_head_each_rank = [
get_dim_for_local_rank(n_head, world_size, local_rank)
for local_rank in range(world_size)
]
n_head_kv_each_rank = [
get_dim_for_local_rank(n_head_kv, world_size, local_rank)
for local_rank in range(world_size)
]
beg_n_head = sum(n_head_each_rank[:rank])
end_n_head = sum(n_head_each_rank[: rank + 1])
beg_n_head_kv = sum(n_head_kv_each_rank[:rank])
end_n_head_kv = sum(n_head_kv_each_rank[: rank + 1])
if n_head_kv == n_head:
x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3)
state_dict[key] = rearrange(
x[:, beg_n_head * head_dim : end_n_head * head_dim],
"three d ... -> (three d) ...",
)
else:
x = rearrange(
state_dict[key],
"(nheadqkv headdim) ... -> nheadqkv headdim ...",
nheadqkv=n_head + 2 * n_head_kv,
)
state_dict[key] = rearrange(
torch.cat(
[
x[beg_n_head:end_n_head],
x[n_head + beg_n_head_kv : n_head + end_n_head_kv],
x[
n_head
+ n_head_kv
+ beg_n_head_kv : n_head
+ n_head_kv
+ end_n_head_kv
],
],
dim=0,
),
"nheadqkv headdim ... -> (nheadqkv headdim) ...",
)
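    # Sharding plan: column-parallel weights (word embeddings, lm_head, Wqkv, mlp.fc1) are split
    # along dim 0; position embeddings and row-parallel weights (out_proj, mlp.fc2) are split along
    # the last dim; the biases of the row-parallel layers are kept only on rank 0.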
shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight")
if "lm_head.weight" in state_dict:
shard_first_dim(state_dict, "lm_head.weight")
if "transformer.embeddings.position_embeddings.weight" in state_dict:
shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight")
for i in range(config.num_hidden_layers):
shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
shard_last_dim(
state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", multiple_of=head_dim
)
if rank != 0:
state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias", None)
if config.activation_function in ["glu", "swiglu", "geglu"]:
shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
else:
shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight")
if rank != 0:
state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias", None)
return state_dict
def combine_state_dicts_tp(state_dicts: List[Dict[str, torch.Tensor]], config: GPT2Config):
"""Convert the list of sharded state_dict of a GPT model with tensor parallel to
the state_dict of a standard GPT model.
This function is meant to be the "reverse" of shard_state_dict_tp.
Precondition:
- state_dicts should be ordered in the same way as the shards were created.
"""
world_size = len(state_dicts)
keys = state_dicts[0].keys()
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
assert vocab_size % world_size == 0
assert config.hidden_size % world_size == 0
inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
assert inner_dim % world_size == 0
assert config.hidden_size % config.n_head == 0
headdim = config.hidden_size // config.n_head
    # Sometimes the word embeddings are sharded on the 0th dim, sometimes on the 1st dim;
    # we detect which by checking whether the shard's 0th dim equals vocab_size // world_size.
def combine_word_embeddings(state_dicts, state_dict, key):
dim = 0 if state_dicts[0][key].shape[0] == vocab_size // world_size else 1
state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
def combine_dim(state_dicts, state_dict, key, dim=-1):
if key in state_dict:
state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
def combine_qkv_headdim(state_dicts, state_dict, key):
n_head = config.n_head
n_head_kv = getattr(config, "n_head_kv", n_head)
if key in state_dict:
if n_head_kv == n_head:
xs = [
rearrange(s[key], "(three d) ... -> three d ...", three=3) for s in state_dicts
]
state_dict[key] = rearrange(torch.cat(xs, dim=1), "three d ... -> (three d) ...")
else:
n_head_each_rank = [
get_dim_for_local_rank(n_head, world_size, local_rank)
for local_rank in range(world_size)
]
n_head_kv_each_rank = [
get_dim_for_local_rank(n_head_kv, world_size, local_rank)
for local_rank in range(world_size)
]
xs = [
rearrange(
s[key],
"(nheadqkv headdim) ... -> nheadqkv headdim ...",
nheadqkv=rank_n_head + 2 * rank_n_head_kv,
headdim=headdim,
)
for s, rank_n_head, rank_n_head_kv in zip(
state_dicts, n_head_each_rank, n_head_kv_each_rank
)
]
wq = torch.cat([x[: n_head_each_rank[rank]] for rank, x in enumerate(xs)], dim=0)
wk = torch.cat(
[
x[
n_head_each_rank[rank] : n_head_each_rank[rank]
+ n_head_kv_each_rank[rank]
]
for rank, x in enumerate(xs)
],
dim=0,
)
wv = torch.cat(
[
x[n_head_each_rank[rank] + n_head_kv_each_rank[rank] :]
for rank, x in enumerate(xs)
],
dim=0,
)
wqkv = torch.cat(
[wq, wk, wv],
dim=0,
)
state_dict[key] = rearrange(
wqkv,
"nheadqkv headdim ... -> (nheadqkv headdim) ...",
)
def combine_gated_mlp(state_dicts, state_dict, key):
if key in state_dict:
xs = [rearrange(s[key], "(two d) ... -> two d ...", two=2) for s in state_dicts]
state_dict[key] = rearrange(torch.cat(xs, dim=1), "two d ... -> (two d) ...")
    state_dict = state_dicts[0].copy()  # don't modify state_dicts[0] in place
combine_word_embeddings(
state_dicts, state_dict, "transformer.embeddings.word_embeddings.weight"
)
if "lm_head.weight" in state_dict:
combine_word_embeddings(state_dicts, state_dict, "lm_head.weight")
if "transformer.embeddings.position_embeddings.weight" in state_dict:
combine_dim(
state_dicts, state_dict, "transformer.embeddings.position_embeddings.weight", -1
)
mlp_combine_fn = (
combine_gated_mlp
if config.activation_function in ["glu", "swiglu", "geglu"]
else partial(combine_dim, dim=0)
)
for i in range(config.num_hidden_layers):
combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", -1)
mlp_combine_fn(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.bias", 0)
combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc2.weight", -1)
return state_dict
def remap_state_dict_hf_gpt2(state_dict, config):
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
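    # HF GPT-2 stores c_fc / c_proj (and c_attn below) as Conv1D modules whose weights have shape
    # (in_features, out_features), so they are transposed here to the (out, in) layout of nn.Linear.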
for d in range(config.num_hidden_layers):
W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight")
state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t()
W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight")
state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
def key_mapping_mlp(key):
key = re.sub(r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key)
key = re.sub(r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
state_dict.pop(f"h.{d}.attn.bias") # We don't store this bias
Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight")
state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
def key_mapping_attn(key):
key = re.sub(r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
key = re.sub(
r"^h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def remap_state_dict_megatron(state_dict, config):
def key_mapping_transformer(key):
key = re.sub(r"^language_model.encoder.", "transformer.", key)
key = re.sub(r"^language_model.", "transformer.", key)
return key
state_dict = OrderedDict((key_mapping_transformer(k), v) for k, v in state_dict.items())
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embedding.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.final_layernorm.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.(weight|bias)",
r"transformer.layers.\1.norm1.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)",
r"transformer.layers.\1.norm2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)",
r"transformer.layers.\1.mlp.fc1.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)",
r"transformer.layers.\1.mlp.fc2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attention.rotary_emb.inv_freq",
r"transformer.layers.\1.mixer.rotary_emb.inv_freq",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)",
r"transformer.layers.\1.mixer.Wqkv.\2",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attention.dense.(weight|bias)",
r"transformer.layers.\1.mixer.out_proj.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
# Megatron stores Wqkv as ((nheads 3 headdim), hidden_dim)
# while we store Wqkv as ((3 nheads headdim), hidden_dim)
headdim = config.hidden_size // config.num_attention_heads
for d in range(config.num_hidden_layers):
Wqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = rearrange(
Wqkv,
"(nheads three headdim) ... -> (three nheads headdim) ...",
three=3,
headdim=headdim,
)
bqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = rearrange(
bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
)
return state_dict
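# --- Added illustration (hedged sketch, not part of the original file) ---
# The loop above permutes rows of Wqkv from Megatron's per-head [q_h, k_h, v_h] packing
# (nheads x 3 x headdim along dim 0) to the [all Q, all K, all V] packing used here
# (3 x nheads x headdim). A tiny self-check of that row permutation:
def _check_wqkv_layout_permutation(nheads=2, headdim=3, hidden=5):
    import torch
    from einops import rearrange
    w = torch.arange(nheads * 3 * headdim * hidden, dtype=torch.float32).reshape(
        nheads * 3 * headdim, hidden
    )
    w_perm = rearrange(
        w, "(nheads three headdim) ... -> (three nheads headdim) ...", three=3, headdim=headdim
    )
    # First Q row of head 1: row headdim in the target layout, row 3 * headdim in Megatron's.
    assert torch.equal(w_perm[headdim], w[3 * headdim])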
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, GPTNeoXConfig
def remap_state_dict_hf_gpt_neox(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^gpt_neox.", "transformer.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings", False):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("embed_out.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for l in range(config.n_layer):
# We don't store these biases
state_dict.pop(f"transformer.layers.{l}.attention.bias")
state_dict.pop(f"transformer.layers.{l}.attention.masked_bias")
# We don't store these
state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None)
# GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim)
# while we store Wqkv as ((3 nheads headdim), hidden_dim)
headdim = config.hidden_size // config.num_attention_heads
Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange(
Wqkv,
"(nheads three headdim) ... -> (three nheads headdim) ...",
three=3,
headdim=headdim,
)
bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange(
bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
)
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).attention.dense.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config:
assert gpt_neox_config.rotary_emb_base == 10000
return GPT2Config(
vocab_size=gpt_neox_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=gpt_neox_config.hidden_size,
n_layer=gpt_neox_config.num_hidden_layers,
n_head=gpt_neox_config.num_attention_heads,
n_inner=gpt_neox_config.intermediate_size,
activation_function=gpt_neox_config.hidden_act,
resid_pdrop=0.0, # No dropout
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=gpt_neox_config.layer_norm_eps,
initializer_range=gpt_neox_config.initializer_range,
bos_token_id=gpt_neox_config.bos_token_id,
eos_token_id=gpt_neox_config.eos_token_id,
# These are new arguments not in the original GPT2Config
prenorm=True,
parallel_block=gpt_neox_config.use_parallel_residual,
parallel_block_tied_norm=False,
rotary_emb_fraction=gpt_neox_config.rotary_pct,
tie_word_embeddings=gpt_neox_config.tie_word_embeddings,
)
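# --- Added usage sketch (hedged; the checkpoint name below is an assumption) ---
# Typical flow: convert the Hugging Face GPTNeoXConfig into the GPT2Config used by this
# codebase, then remap the checkpoint's state dict into the naming/layout expected here.
# Exact buffer names (e.g. attention.bias) can vary across transformers versions.
def _example_load_gpt_neox(model_name="EleutherAI/pythia-160m"):
    from transformers import GPTNeoXForCausalLM
    hf_model = GPTNeoXForCausalLM.from_pretrained(model_name)
    config = gpt_neox_config_to_gpt2_config(hf_model.config)
    state_dict = remap_state_dict_hf_gpt_neox(hf_model.state_dict(), config)
    return config, state_dict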
# Copyright (c) 2023, Tri Dao.
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTJConfig
def remap_state_dict_hf_gptj(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^transformer.h.", "transformer.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
output_embeddings_bias = state_dict.pop("lm_head.bias")
state_dict["lm_head.bias"] = F.pad(
output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key)
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for l in range(config.n_layer):
Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight")
Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight")
Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight")
state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
# We don't store these biases
state_dict.pop(f"transformer.layers.{l}.attn.bias")
state_dict.pop(f"transformer.layers.{l}.attn.masked_bias")
def key_mapping_attn(key):
return re.sub(
r"^transformer.layers.(\d+).attn.out_proj.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config:
headdim = gptj_config.n_embd // gptj_config.n_head
return GPT2Config(
vocab_size=gptj_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=gptj_config.n_embd,
n_layer=gptj_config.n_layer,
n_head=gptj_config.n_head,
n_inner=gptj_config.n_inner,
activation_function=gptj_config.activation_function,
resid_pdrop=gptj_config.resid_pdrop,
embd_pdrop=gptj_config.embd_pdrop,
attn_pdrop=gptj_config.attn_pdrop,
layer_norm_epsilon=gptj_config.layer_norm_epsilon,
initializer_range=gptj_config.initializer_range,
bos_token_id=gptj_config.bos_token_id,
eos_token_id=gptj_config.eos_token_id,
# These are new arguments not in the original GPT2Config
prenorm=True,
parallel_block=True,
parallel_block_tied_norm=True,
rotary_emb_fraction=gptj_config.rotary_dim / headdim,
rotary_emb_interleaved=True,
tie_word_embeddings=False,
qkv_proj_bias=False,
out_proj_bias=False,
lm_head_bias=True,
)
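# --- Added usage sketch (hedged; the checkpoint name below is an assumption) ---
# GPT-J keeps separate q/k/v projections and an untied, biased lm_head; the remap above
# concatenates q/k/v into a single Wqkv, and the config conversion records GPT-J's
# interleaved rotary embedding over rotary_dim of each head. Buffer names such as
# attn.bias / attn.masked_bias depend on the transformers version.
def _example_load_gptj(model_name="EleutherAI/gpt-j-6B"):
    from transformers import GPTJForCausalLM
    hf_model = GPTJForCausalLM.from_pretrained(model_name)
    config = gptj_config_to_gpt2_config(hf_model.config)
    state_dict = remap_state_dict_hf_gptj(hf_model.state_dict(), config)
    return config, state_dict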