OpenDAS / ColossalAI · Commits · 5c3843dc
"...git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "ec18fc7340f99693f2436e91e1dea99342f476d5"
Commit 5c3843dc (Unverified), authored Dec 21, 2021 by shenggan; committed by GitHub on Dec 21, 2021.

add colossalai kernel module (#55)

Parent: 648f8063
Changes: 43
Showing 20 changed files with 2858 additions and 0 deletions (+2858 −0)
colossalai/kernel/__init__.py                                               +8 −0
colossalai/kernel/cuda_native/__init__.py                                   +17 −0
colossalai/kernel/cuda_native/builder.py                                    +114 −0
colossalai/kernel/cuda_native/csrc/compat.h                                 +13 −0
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu                 +191 −0
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu               +87 −0
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu                     +169 −0
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu               +1001 −0
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu               +232 −0
colossalai/kernel/cuda_native/csrc/kernels/include/block_reduce.h           +312 −0
colossalai/kernel/cuda_native/csrc/kernels/include/context.h                +36 −0
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h    +46 −0
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h        +40 −0
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h              +34 −0
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h                +95 −0
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h           +68 −0
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h                +274 −0
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh               +12 −0
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h        +65 −0
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h                +44 −0
colossalai/kernel/__init__.py  0 → 100644

from .jit.bias_dropout_add import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
from .jit.bias_gelu import bias_gelu_impl
from .cuda_native import LayerNorm, FusedScaleMaskSoftmax, MultiHeadAttention

__all__ = [
    "bias_dropout_add_fused_train", "bias_dropout_add_fused_inference", "bias_gelu_impl",
    "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"
]
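For context, a minimal usage sketch of the ops exported above (not part of this commit; it assumes a CUDA-enabled PyTorch install, and the bias_gelu_impl argument order is assumed from the Megatron-style JIT kernels these wrap):

# Hypothetical usage sketch, assuming torch with CUDA is available.
import torch
from colossalai.kernel import LayerNorm, bias_gelu_impl

hidden = 1024
x = torch.randn(8, 128, hidden, device='cuda', dtype=torch.float16)
bias = torch.zeros(hidden, device='cuda', dtype=torch.float16)

norm = LayerNorm(hidden).cuda().half()   # MixedFusedLayerNorm under the hood
y = norm(x)                              # fused CUDA layer norm
z = bias_gelu_impl(y, bias)              # fused bias + GeLU (JIT kernel); argument order assumed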
colossalai/kernel/cuda_native/__init__.py  0 → 100644

from .builder import _build_cuda_native_kernel

CUDA_NATIVE_KERNEL_BUILD = False


def build_cuda_native_kernel():
    global CUDA_NATIVE_KERNEL_BUILD
    if CUDA_NATIVE_KERNEL_BUILD == False:
        _build_cuda_native_kernel()
        CUDA_NATIVE_KERNEL_BUILD = True


build_cuda_native_kernel()

from .layer_norm import MixedFusedLayerNorm as LayerNorm
from .scaled_softmax import FusedScaleMaskSoftmax
from .multihead_attention import MultiHeadAttention
\ No newline at end of file
colossalai/kernel/cuda_native/builder.py  0 → 100644

import os
import pathlib
import subprocess

from torch.utils import cpp_extension

# Setting this param to a list has a problem of generating different
# compilation commands (with diferent order of architectures) and
# leading to recompilation of fused kernels. Set it to empty string
# to avoid recompilation and assign arch flags explicity in
# extra_cuda_cflags below
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def _build_cuda_native_kernel():

    # Check if cuda 11 is installed for compute capability 8.0
    cc_flag = []
    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    if int(bare_metal_major) >= 11:
        cc_flag.append('-gencode')
        cc_flag.append('arch=compute_80,code=sm_80')

    # Build path
    basepath = pathlib.Path(__file__).parent.absolute()
    srcpath = basepath / 'csrc'
    buildpath = basepath / 'build'
    _create_build_dir(buildpath)

    # Helper function to build the kernels.
    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_cflags=['-O3', ],
            extra_include_paths=[str(srcpath / 'kernels' / 'include')],
            extra_cuda_cflags=['-O3', '-gencode', 'arch=compute_70,code=sm_70', '--use_fast_math'] +
                              extra_cuda_flags + cc_flag,
            verbose=False)

    # ==============
    # Fused softmax.
    # ==============

    extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                        '-U__CUDA_NO_HALF_CONVERSIONS__',
                        '--expt-relaxed-constexpr',
                        '--expt-extended-lambda']

    # Upper triangular softmax.
    sources = [srcpath / 'scaled_upper_triang_masked_softmax.cpp',
               srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
    colossal_scaled_upper_triang_masked_softmax = _cpp_extention_load_helper(
        "colossal_scaled_upper_triang_masked_softmax", sources, extra_cuda_flags)

    # Masked softmax.
    sources = [srcpath / 'scaled_masked_softmax.cpp',
               srcpath / 'scaled_masked_softmax_cuda.cu']
    colossal_scaled_masked_softmax = _cpp_extention_load_helper(
        "colossal_scaled_masked_softmax", sources, extra_cuda_flags)

    # =================================
    # Mixed precision fused layer norm.
    # =================================

    extra_cuda_flags = ['-maxrregcount=50']
    sources = [srcpath / 'layer_norm_cuda.cpp',
               srcpath / 'layer_norm_cuda_kernel.cu']
    colossal_layer_norm_cuda = _cpp_extention_load_helper(
        "colossal_layer_norm_cuda", sources, extra_cuda_flags)

    # ==========================================
    # Mixed precision Transformer Encoder Layer.
    # ==========================================

    extra_cuda_flags = ['-std=c++14',
                        '-U__CUDA_NO_HALF_OPERATORS__',
                        '-U__CUDA_NO_HALF_CONVERSIONS__',
                        '-U__CUDA_NO_HALF2_OPERATORS__',
                        '-DTHRUST_IGNORE_CUB_VERSION_CHECK']

    sources = [srcpath / 'multihead_attention_1d.cpp']
    kernel_sources = ["cublas_wrappers.cu",
                      "transform_kernels.cu",
                      "dropout_kernels.cu",
                      "normalize_kernels.cu",
                      "softmax_kernels.cu",
                      "general_kernels.cu",
                      "cuda_util.cu"]
    sources += [(srcpath / 'kernels' / cu_file) for cu_file in kernel_sources]
    colossal_multihead_attention = _cpp_extention_load_helper(
        "colossal_multihead_attention", sources, extra_cuda_flags)


def _get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]

    return raw_output, bare_metal_major, bare_metal_minor


def _create_build_dir(buildpath):
    try:
        os.mkdir(buildpath)
    except OSError:
        if not os.path.isdir(buildpath):
            print(f"Creation of the build directory {buildpath} failed")
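_get_cuda_bare_metal_version parses the human-readable banner printed by nvcc -V. A small illustrative check of that parsing logic (the banner text below is a typical example I supply for illustration, not output captured from this repository):

# Illustrative check of the release-string parsing used above (sample banner text, not from this repo).
sample = ("nvcc: NVIDIA (R) Cuda compiler driver\n"
          "Cuda compilation tools, release 11.3, V11.3.109")
tokens = sample.split()
release = tokens[tokens.index("release") + 1].split(".")   # ["11", "3,"]
major, minor = release[0], release[1][0]
assert (major, minor) == ("11", "3")   # so the sm_80 -gencode flags are appended when major >= 11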
colossalai/kernel/cuda_native/csrc/compat.h  0 → 100644

/*This code from NVIDIA apex:
 *     https://github.com/NVIDIA/apex
 *     with minor changes. */

#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
\ No newline at end of file
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu  0 → 100644

#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"

ls::cub::CachingDeviceAllocator g_allocator(true);

template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
    const T *__restrict__ inputs, const int *__restrict__ targets,
    float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[2] = {0.f, 0.f};  // logit and logit exp
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    if (threadIdx.x == 0) {
      nll_loss_outputs[blockIdx.x] = 0.f;
      outputs[blockIdx.x] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += logit;
    sum_logits[1] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 2>(sum_logits);
  __shared__ float s_sum_logit;
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_logit = sum_logits[0];
    s_sum_exp = sum_logits[1];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  if (threadIdx.x == 0) {
    // neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
    float nll_loss = logf(s_sum_exp) -
                     static_cast<float>(inputs[block_start + target_tid]) + s_max_input;
    nll_loss_outputs[blockIdx.x] = nll_loss;
    float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
    outputs[blockIdx.x] = (1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
  }
}

template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
    const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
    const int *__restrict__ targets, T *__restrict__ grad_inputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[1] = {0.f};
  const float grad_out = static_cast<float>(grad_outputs[0]);
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    for (int i = left_idx; i < right_idx; i += blockDim.x) {
      grad_inputs[i] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 1>(sum_logits);
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_exp = sum_logits[0];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  float nll_weight = 1.0 - epsilon - eps_i;

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
    float grad = 0;
    grad += (vocab_size * prob - 1) * eps_i;
    grad += prob * nll_weight;
    if ((i - block_start) == target_tid) {
      grad -= nll_weight;
    }
    grad_inputs[i] = grad_out * grad;
  }
}

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  float *nll_loss_buffer = loss_buffer + grid_dim;
  ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
      epsilon, vocab_size);

  int num_items = grid_dim;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                                             loss_buffer, outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                                             loss_buffer, outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                                             nll_loss_buffer, nll_loss_ptr, num_items, stream));
  CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}

template void launch_cross_entropy_fw<float>(
    const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_fw<__half>(
    const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr, const T *inputs_ptr,
                             const int *targets_ptr, T *grad_inputs_ptr,
                             const int padding_idx, const float epsilon,
                             const int batch_size, const int seq_len,
                             const int vocab_size, cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
      epsilon, vocab_size);
}

template void launch_cross_entropy_bw<float>(
    const float *grad_outputs_ptr, const float *inputs_ptr,
    const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_bw<__half>(
    const float *grad_outputs_ptr, const __half *inputs_ptr,
    const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);
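For reference, the per-token quantity accumulated by the forward kernel above is the label-smoothed cross entropy. Writing the shifted logits as x_i - x_max, target index t, vocabulary size V and smoothing epsilon (the notation is mine, added for clarity, not from the source):

\mathrm{nll} = \log\sum_{i} e^{x_i - x_{\max}} - (x_{t} - x_{\max}), \qquad \epsilon_i = \frac{\epsilon}{V - 1}

\mathrm{loss} = (1 - \epsilon - \epsilon_i)\,\mathrm{nll} + \epsilon_i \left( V \log\sum_{i} e^{x_i - x_{\max}} - \sum_{i} (x_i - x_{\max}) \right)

Padding positions contribute zero, and the per-token values in loss_buffer / nll_loss_buffer are then summed across the batch with cub::DeviceReduce::Sum.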
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu  0 → 100644

/* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
*/
#include "cublas_wrappers.h"

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_16F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, (void *)C, CUDA_R_16F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m, stride_C,
      batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
            "error: %d)\n",
            batch, m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const __half *A, const __half *B, __half *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m, stride_C,
      batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu  0 → 100644

#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#include "cuda_util.h"

/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
  return cudaGetErrorString(error);
}

std::string _cudaGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }
  return "CUBLAS_UNKNOW";
}

template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line) {
  if (result) {
    throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" +
                             std::to_string(line) + "): " +
                             (_cudaGetErrorString(result)) + "\n");
  }
}

template void check_gpu_error<cudaError_t>(cudaError_t result, char const *const func,
                                           const char *const file, int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result, char const *const func,
                                              const char *const file, int const line);

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<T> hout(num_output_ele, (T)0);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << hout[i] << ", ";
  }
  std::cout << std::endl;
}

template <>
void print_vec<__half>(const __half *outv, std::string outn, int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<__half> hout(num_output_ele, (__half)0.f);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << __half2float(hout[i]) << ", ";
  }
  std::cout << std::endl;
}

template void print_vec<float>(const float *outv, std::string outn, int num_output_ele);
template void print_vec<int>(const int *outv, std::string outn, int num_output_ele);
template void print_vec<__half>(const __half *outv, std::string outn, int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num) {
  size_t byte_size = ele_num * sizeof(T);
  T *pdata = nullptr;
  CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
  return pdata;
}

template float *cuda_malloc<float>(size_t ele_num);
template __half *cuda_malloc<__half>(size_t ele_num);
template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);

void cuda_free(void *pdata) {
  if (pdata != nullptr) {
    cudaFree(pdata);
  }
}

template <typename T>
struct _isnan {
  __device__ bool operator()(T a) const { return isnan(a); }
};

template <>
struct _isnan<__half> {
  __device__ bool operator()(const __half a) const { return __hisnan(a); }
};

template <typename T>
struct _isinf {
  __device__ bool operator()(T a) const { return isinf(a); }
};

template <>
struct _isinf<__half> {
  __device__ bool operator()(const __half a) const { return __hisinf(a); }
};

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream) {
  // check_nan_inf = 0 for checking nan
  // check_nan_inf = 1 for checking inf
  bool res = false;
  std::string msg = file + "(" + std::to_string(line) + "): ";
  if (check_nan_inf) {
    msg += "nan.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isnan<T>(), false,
                                   thrust::logical_or<bool>());
  } else {
    msg += "inf.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isinf<T>(), false,
                                   thrust::logical_or<bool>());
  }
  if (res) {
    throw std::runtime_error(msg);
  }
  std::cout << msg << " [check pass]." << std::endl;
}

template void check_nan_inf<float>(const float *data_ptr, int dsize, bool check_nan_inf,
                                   std::string file, int line, cudaStream_t stream);
template void check_nan_inf<__half>(const __half *data_ptr, int dsize, bool check_nan_inf,
                                    std::string file, int line, cudaStream_t stream);
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu  0 → 100644

#include <chrono>
#include <ctime>

#include "kernels.h"

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

curandStatePhilox4_32_10_t *curandstate;

/**
 * @brief element-wise activation function on device, like Relu, Gelu
 *
 * @tparam enum class ActivationType, kRelu, kGelu
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape and type with input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_kernel(T x);

template <>
__device__ float activation_kernel<ActivationType::kGelu, float>(float x) {
  float cdf =
      0.5f * (1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x))));
  return x * cdf;
}

template <>
__device__ __half2
activation_kernel<ActivationType::kGelu, __half2>(__half2 val) {
  __half2 val_pow3 = __hmul2(val, __hmul2(val, val));
  float2 tmp_pow = __half22float2(val_pow3);
  float2 tmp = __half22float2(val);

  tmp.x = 0.5f * (1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
  tmp.y = 0.5f * (1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
  return __hmul2(val, __float22half2_rn(tmp));
}

template <>
__device__ float activation_kernel<ActivationType::kRelu, float>(float x) {
  return fmaxf(x, 0);
}

template <>
__device__ __half2
activation_kernel<ActivationType::kRelu, __half2>(__half2 x) {
  return __floats2half2_rn(fmaxf(0.f, __half2float(x.x)),
                           fmaxf(0.f, __half2float(x.y)));
}
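The kGelu specializations above implement the usual tanh approximation of GeLU; written out for reference (standard formula, not taken from this commit):

\mathrm{GeLU}(x) \approx \tfrac{1}{2}\, x \left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\, x^{3}\bigr)\right)\right), \qquad \sqrt{\tfrac{2}{\pi}} \approx 0.7978845608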
/**
 * @brief element-wise activation backward function on device
 *
 * @tparam enum class ActivationType
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape of input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_bwd_kernel(T grad, T x);

template <>
__device__ float activation_bwd_kernel<ActivationType::kGelu, float>(float grad, float x) {
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * (dg1 + dg2 + dg3);
}

template <>
__device__ __half activation_bwd_kernel<ActivationType::kGelu, __half>(__half grad, __half x_half) {
  float x = __half2float(x_half);
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * __float2half(dg1 + dg2 + dg3);
}

template <>
__device__ float activation_bwd_kernel<ActivationType::kRelu, float>(float grad, float x) {
  return x > 0.f ? grad : 0.f;
}

template <>
__device__ __half activation_bwd_kernel<ActivationType::kRelu, __half>(__half grad, __half x) {
  const __half half_zero = __float2half(0.f);
  return x > half_zero ? grad : half_zero;
}

template <>
__device__ __half2 activation_bwd_kernel<ActivationType::kRelu, __half2>(__half2 grad2, __half2 x_half2) {
  const __half half_zero = __float2half(0.f);
  return __floats2half2_rn(x_half2.x > half_zero ? grad2.x : half_zero,
                           x_half2.y > half_zero ? grad2.y : half_zero);
}

/**
 * @brief init curand states in global memory
 *
 * @thread grid_dim * block*dim to suuport any size of states
 * @param state persistant curand states
 * @param seed seed to init states
 * @return void
 */
__global__ void curand_init_kernel(curandStatePhilox4_32_10_t *state, int seed) {
  /* Each thread gets same seed, a different sequence
  number, no offset */
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  curand_init(seed, id, 0, &state[id]);
}

void launch_curand_init(int total_count, int dim, cudaStream_t stream) {
  cudaMalloc(&curandstate, total_count * sizeof(curandStatePhilox4_32_10_t));
  int grid_dim = total_count >> 9;
  curand_init_kernel<<<grid_dim, 512, 0, stream>>>(
      curandstate, std::chrono::duration_cast<std::chrono::microseconds>(
                       std::chrono::system_clock::now().time_since_epoch())
                       .count());
}

/**
 * @brief element-wise dropout, store dropped position in mask, it's not
 * in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out any size of float and __half
 * @param in same with out
 * @param mask uint8 type, same size with out
 * @param seed seed to curand
 * @return void
 */
__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  float *__restrict__ out,
                                  const float *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];

  float4 input4 = data4[i];
  float4 res4;
  res4.x = input4.x * scale * m[0];
  res4.y = input4.y * scale * m[1];
  res4.z = input4.z * scale * m[2];
  res4.w = input4.w * scale * m[3];
  out4[i] = res4;
}

__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  __half *__restrict__ out,
                                  const __half *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = (uint8_t)(rand.x > ratio);
  m[5] = (uint8_t)(rand.y > ratio);
  m[6] = (uint8_t)(rand.z > ratio);
  m[7] = (uint8_t)(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = *m8;

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
  __half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
  __half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
  __half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  outs_float4[i] = out_float4;
}

/**
 * @brief element-wise dropout backward with dropout mask, it's
 * not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param in any size of float and __half
 * @param mask uint8 type, same size with in
 * @return void
 */
__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      float *out, const float *in,
                                      const uint8_t *__restrict__ mask) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *in4 = reinterpret_cast<const float4 *>(in);
  const uint32_t *mask4 = reinterpret_cast<const uint32_t *>(mask);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  m4[0] = mask4[i];

  float4 input4 = in4[i];
  float4 res4;
  res4.x = input4.x * scale * static_cast<float>(m[0]);
  res4.y = input4.y * scale * static_cast<float>(m[1]);
  res4.z = input4.z * scale * static_cast<float>(m[2]);
  res4.w = input4.w * scale * static_cast<float>(m[3]);
  out4[i] = res4;
}

__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      __half *out, const __half *in,
                                      const uint8_t *__restrict__ mask) {
  const __half scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  const uint64_t *mask8 = reinterpret_cast<const uint64_t *>(mask);

  uint8_t m[8];
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  m8[0] = mask8[i];

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 = __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 = __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 = __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 = __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  out4[i] = out_float4;
}

template <>
void launch_ls_dropout<float>(float *out, const float *vals, uint8_t *mask,
                              int total_count, float ratio,
                              cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 12;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count, ratio,
                                                             out, vals, mask);
  }
}

template <>
void launch_ls_dropout<__half>(__half *out, const __half *vals, uint8_t *mask,
                               int total_count, float ratio,
                               cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 13;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count, ratio,
                                                             out, vals, mask);
  }
}
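The forward and backward dropout kernels above are standard inverted dropout with a per-element Bernoulli mask stored in uint8; written out for reference (standard definition, not from the source):

m_i \sim \mathrm{Bernoulli}(1 - p), \qquad y_i = \frac{m_i}{1 - p}\, x_i, \qquad \frac{\partial L}{\partial x_i} = \frac{m_i}{1 - p}\,\frac{\partial L}{\partial y_i}

Scaling by 1/(1 - p) at training time keeps the expected activation unchanged, so inference needs no rescaling; the saved mask makes the backward pass a pure masked scaling of the incoming gradient.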
/**
 * @brief fused bias, dropout, and residual at the end of Attention and FFN,
 * store dropped position in mask, it's not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out [batch_size, seq_len, hidden_size], float and __half
 * @param in [batch_size, seq_len, hidden_size], float and __half
 * @param mask [batch_size, seq_len, hidden_size], uint8 type
 * @param bias [hidden_size], ffn bias
 * @param residual [batch_size, seq_len, hidden_size], float and __half
 * @param seed seed to curand
 * @param hidden_size hidden size
 * @return void
 */
__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, float *__restrict__ out,
    const float *__restrict__ in, uint8_t *__restrict__ mask,
    const float *__restrict__ bias, const float *__restrict__ residual,
    const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);

  int bias_i = i % (hidden_size >> 2);
  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];
  const float4 input4 = data4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 output4;

  output4.x = (input4.x + b4.x) * scale * m[0] + res4.x;
  output4.y = (input4.y + b4.y) * scale * m[1] + res4.y;
  output4.z = (input4.z + b4.z) * scale * m[2] + res4.z;
  output4.w = (input4.w + b4.w) * scale * m[3] + res4.w;

  out4[i] = output4;
}

__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, __half *__restrict__ out,
    const __half *__restrict__ in, uint8_t *__restrict__ mask,
    const __half *__restrict__ bias, const __half *__restrict__ residual,
    const int seed, const int hidden_size) {
  const __half scale = 1. / (1. - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = static_cast<uint8_t>(rand.x > ratio);
  m[5] = static_cast<uint8_t>(rand.y > ratio);
  m[6] = static_cast<uint8_t>(rand.z > ratio);
  m[7] = static_cast<uint8_t>(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = m8[0];

  int bias_i = i % (hidden_size >> 3);
  float4 val_float4 = vals_float4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 out_float4;

  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
  const __half2 *res_half2 = reinterpret_cast<const __half2 *>(&res4);
  __half2 scale_mask_1 = __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 = __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 = __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 = __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] = __hfma2(__hadd2(val_half2[0], b_half2[0]), scale_mask_1, res_half2[0]);
  out_half2[1] = __hfma2(__hadd2(val_half2[1], b_half2[1]), scale_mask_2, res_half2[1]);
  out_half2[2] = __hfma2(__hadd2(val_half2[2], b_half2[2]), scale_mask_3, res_half2[2]);
  out_half2[3] = __hfma2(__hadd2(val_half2[3], b_half2[3]), scale_mask_4, res_half2[3]);
  outs_float4[i] = out_float4;
}

template <>
void launch_ls_dropout_res_bias<float>(float *out, const float *vals,
                                       uint8_t *mask, const float *bias,
                                       const float *residual, int total_count,
                                       int dim, float ratio,
                                       cudaStream_t stream) {
  int grid_dim = total_count >> 12;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}

template <>
void launch_ls_dropout_res_bias<__half>(__half *out, const __half *vals,
                                        uint8_t *mask, const __half *bias,
                                        const __half *residual, int total_count,
                                        int dim, float ratio,
                                        cudaStream_t stream) {
  int grid_dim = total_count >> 13;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}

/**
 * @brief fused bias and dropout backward at the end of Attention and FFN
 *
 * @thread
 * gridDim.x = hidden_size / 8
 * blockDim.x = 8
 * blockDim.y = 1024 / 8 = 128
 *
 * @param row_size batch_size * seq_len
 * @param ratio dropout ratio
 * @param in_grad [batch_size, seq_len, hidden_size], input grad
 * @param bias_grad [hidden_size], bias grad
 * @param out_grad [batch_size, seq_len, hidden_size], output grad
 * @param mask [batch_size, seq_len, hidden_size], dropout mask
 * @param hidden_size
 * @return void
 */
__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, float *__restrict__ in_grad,
    float *__restrict__ bias_grad, const float *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  // every block generate 8 bias result
  __shared__ float tile[8][129];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  float local_sum = 0;

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    float val = out_grad[idx];
    val *= scale * static_cast<float>(mask[idx]);
    local_sum += val;
    in_grad[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  float sum = 0;
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad[pos] = tile[0][threadIdx.x];
  }
}

__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, __half *__restrict__ in_grad,
    __half *__restrict__ bias_grad, const __half *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
  __shared__ __half2 tile[8][129];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
  const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
  __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);

  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  __half2 local_sum = __float2half2_rn(0.f);

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    __half2 val = out_grad2[idx];
    __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
    val *= scale * m2;
    local_sum += val;
    in_grad2[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  __half2 sum = __float2half2_rn(0.f);
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad2[pos] = tile[0][threadIdx.x];
  }
}

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template <>
void launch_ls_dropout_bias_bwd(__half *in_grad, __half *bias_grad,
                                const __half *out_grad, const uint8_t *mask,
                                int row_size, int dim, float ratio,
                                cudaStream_t stream) {
  dim >>= 1;
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template void launch_ls_dropout_bias_bwd(float *in_grad, float *bias_grad,
                                         const float *out_grad,
                                         const uint8_t *mask, int row_size,
                                         int dim, float ratio,
                                         cudaStream_t stream);
/**
* @brief fused bias, activation, and dropout at the end of first ffn
*
* @thread
* gridDim.x = hidden_size / 8
* blockDim.x = 8
* blockDim.y = 1024 / 8 = 128
*
* @tparam act_type activation function, like kRelu, kGelu
* @param total_count total elements
* @param ratio drop ratio
* @param out [batch_size, seq_len, hidden_size], float and __half
* @param in [batch_size, seq_len, hidden_size], float and __half
* @param mask [batch_size, seq_len, hidden_size], uint8 type
* @param bias [hidden_size], ffn bias
* @param seed seed to curand
* @param hidden_size
* @return void
*/
template
<
ActivationType
act_type
>
__global__
void
ls_dropout_act_bias_kernel
(
const
int
total_count
,
const
float
ratio
,
float
*
__restrict__
out
,
const
float
*
__restrict__
in
,
uint8_t
*
__restrict__
mask
,
const
float
*
__restrict__
bias
,
const
int
seed
,
const
int
hidden_size
)
{
const
float
scale
=
1.
f
/
(
1.
f
-
ratio
);
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
*
4
>=
total_count
)
return
;
curandStatePhilox4_32_10_t
state
;
curand_init
(
seed
,
i
,
0
,
&
state
);
uint8_t
m
[
4
];
float4
*
out4
=
reinterpret_cast
<
float4
*>
(
out
);
const
float4
*
data4
=
reinterpret_cast
<
const
float4
*>
(
in
);
const
float4
*
bias4
=
reinterpret_cast
<
const
float4
*>
(
bias
);
uint32_t
*
mask4
=
reinterpret_cast
<
uint32_t
*>
(
mask
);
float4
rand
=
curand_uniform4
(
&
state
);
m
[
0
]
=
(
uint8_t
)(
rand
.
x
>
ratio
);
m
[
1
]
=
(
uint8_t
)(
rand
.
y
>
ratio
);
m
[
2
]
=
(
uint8_t
)(
rand
.
z
>
ratio
);
m
[
3
]
=
(
uint8_t
)(
rand
.
w
>
ratio
);
int
bias_i
=
i
%
(
hidden_size
>>
2
);
uint32_t
*
m4
=
reinterpret_cast
<
uint32_t
*>
(
m
);
mask4
[
i
]
=
m4
[
0
];
const
float4
input4
=
data4
[
i
];
const
float4
b4
=
__ldg
(
&
bias4
[
bias_i
]);
float4
output4
;
output4
.
x
=
activation_kernel
<
act_type
,
float
>
(
input4
.
x
+
b4
.
x
)
*
scale
*
m
[
0
];
output4
.
y
=
activation_kernel
<
act_type
,
float
>
(
input4
.
y
+
b4
.
y
)
*
scale
*
m
[
1
];
output4
.
z
=
activation_kernel
<
act_type
,
float
>
(
input4
.
z
+
b4
.
z
)
*
scale
*
m
[
2
];
output4
.
w
=
activation_kernel
<
act_type
,
float
>
(
input4
.
w
+
b4
.
w
)
*
scale
*
m
[
3
];
out4
[
i
]
=
output4
;
}
template
<
ActivationType
act_type
>
__global__
void
ls_dropout_act_bias_kernel
(
const
int
total_count
,
const
float
ratio
,
__half
*
__restrict__
out
,
const
__half
*
__restrict__
in
,
uint8_t
*
__restrict__
mask
,
const
__half
*
__restrict__
bias
,
const
int
seed
,
const
int
hidden_size
)
{
const
float
scale
=
1.
f
/
(
1.
f
-
ratio
);
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
*
8
>=
total_count
)
return
;
curandStatePhilox4_32_10_t
state
;
curand_init
(
seed
,
i
,
0
,
&
state
);
const
float4
*
vals_float4
=
reinterpret_cast
<
const
float4
*>
(
in
);
float4
*
outs_float4
=
reinterpret_cast
<
float4
*>
(
out
);
const
float4
*
bias4
=
reinterpret_cast
<
const
float4
*>
(
bias
);
uint64_t
*
mask8
=
reinterpret_cast
<
uint64_t
*>
(
mask
);
uint8_t
m
[
8
];
float4
rand
=
curand_uniform4
(
&
state
);
m
[
0
]
=
(
uint8_t
)(
rand
.
x
>
ratio
);
m
[
1
]
=
(
uint8_t
)(
rand
.
y
>
ratio
);
m
[
2
]
=
(
uint8_t
)(
rand
.
z
>
ratio
);
m
[
3
]
=
(
uint8_t
)(
rand
.
w
>
ratio
);
rand
=
curand_uniform4
(
&
state
);
m
[
4
]
=
(
uint8_t
)(
rand
.
x
>
ratio
);
m
[
5
]
=
(
uint8_t
)(
rand
.
y
>
ratio
);
m
[
6
]
=
(
uint8_t
)(
rand
.
z
>
ratio
);
m
[
7
]
=
(
uint8_t
)(
rand
.
w
>
ratio
);
uint64_t
*
m8
=
reinterpret_cast
<
uint64_t
*>
(
m
);
mask8
[
i
]
=
*
m8
;
int
bias_i
=
i
%
(
hidden_size
>>
3
);
float4
val_float4
=
vals_float4
[
i
];
const
float4
b4
=
__ldg
(
&
bias4
[
bias_i
]);
float4
out_float4
;
__half2
*
val_half2
=
reinterpret_cast
<
__half2
*>
(
&
val_float4
);
__half2
*
out_half2
=
reinterpret_cast
<
__half2
*>
(
&
out_float4
);
const
__half2
*
b_half2
=
reinterpret_cast
<
const
__half2
*>
(
&
b4
);
__half2
scale_mask_1
=
__floats2half2_rn
(
scale
*
m
[
0
],
scale
*
m
[
1
]);
__half2
scale_mask_2
=
__floats2half2_rn
(
scale
*
m
[
2
],
scale
*
m
[
3
]);
__half2
scale_mask_3
=
__floats2half2_rn
(
scale
*
m
[
4
],
scale
*
m
[
5
]);
__half2
scale_mask_4
=
__floats2half2_rn
(
scale
*
m
[
6
],
scale
*
m
[
7
]);
out_half2
[
0
]
=
__hmul2
(
activation_kernel
<
act_type
,
__half2
>
(
__hadd2
(
val_half2
[
0
],
b_half2
[
0
])),
scale_mask_1
);
out_half2
[
1
]
=
__hmul2
(
activation_kernel
<
act_type
,
__half2
>
(
__hadd2
(
val_half2
[
1
],
b_half2
[
1
])),
scale_mask_2
);
out_half2
[
2
]
=
__hmul2
(
activation_kernel
<
act_type
,
__half2
>
(
__hadd2
(
val_half2
[
2
],
b_half2
[
2
])),
scale_mask_3
);
out_half2
[
3
]
=
__hmul2
(
activation_kernel
<
act_type
,
__half2
>
(
__hadd2
(
val_half2
[
3
],
b_half2
[
3
])),
scale_mask_4
);
outs_float4
[
i
]
=
out_float4
;
}
template
<
>
void
launch_ls_dropout_act_bias
<
ActivationType
::
kGelu
,
float
>
(
float
*
out
,
const
float
*
vals
,
uint8_t
*
mask
,
const
float
*
bias
,
int
total_count
,
int
dim
,
float
ratio
,
cudaStream_t
stream
)
{
int
grid_dim
=
total_count
>>
10
;
ls_dropout_act_bias_kernel
<
ActivationType
::
kGelu
>
<<<
grid_dim
+
1
,
256
,
0
,
stream
>>>
(
total_count
,
ratio
,
out
,
vals
,
mask
,
bias
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
())
.
count
(),
dim
);
}
template
<
>
void
launch_ls_dropout_act_bias
<
ActivationType
::
kGelu
,
__half
>
(
__half
*
out
,
const
__half
*
vals
,
uint8_t
*
mask
,
const
__half
*
bias
,
int
total_count
,
int
dim
,
float
ratio
,
cudaStream_t
stream
)
{
int
grid_dim
=
total_count
>>
11
;
ls_dropout_act_bias_kernel
<
ActivationType
::
kGelu
>
<<<
grid_dim
+
1
,
256
,
0
,
stream
>>>
(
total_count
,
ratio
,
out
,
vals
,
mask
,
bias
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
())
.
count
(),
dim
);
}
template
<
>
void
launch_ls_dropout_act_bias
<
ActivationType
::
kRelu
,
float
>
(
float
*
out
,
const
float
*
vals
,
uint8_t
*
mask
,
const
float
*
bias
,
int
total_count
,
int
dim
,
float
ratio
,
cudaStream_t
stream
)
{
int
grid_dim
=
total_count
>>
10
;
ls_dropout_act_bias_kernel
<
ActivationType
::
kRelu
>
<<<
grid_dim
+
1
,
256
,
0
,
stream
>>>
(
total_count
,
ratio
,
out
,
vals
,
mask
,
bias
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
())
.
count
(),
dim
);
}
template
<
>
void
launch_ls_dropout_act_bias
<
ActivationType
::
kRelu
,
__half
>
(
__half
*
out
,
const
__half
*
vals
,
uint8_t
*
mask
,
const
__half
*
bias
,
int
total_count
,
int
dim
,
float
ratio
,
cudaStream_t
stream
)
{
int
grid_dim
=
total_count
>>
11
;
ls_dropout_act_bias_kernel
<
ActivationType
::
kRelu
>
<<<
grid_dim
+
1
,
256
,
0
,
stream
>>>
(
total_count
,
ratio
,
out
,
vals
,
mask
,
bias
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
())
.
count
(),
dim
);
}
/**
* @brief fused bias, activation, and dropout backward
*
* @thread
* gridDim.x = total_count / 1024
* blockDim.x = 1024
*
* @tparam act_type kRelu
* @param row_size batch_size * seq_len
* @param ratio dropout ratio
* @param in_grad [batch_size, seq_len, hidden_size], input grad
* @param bias_grad [hidden_size], bias grad
* @param out_grad [batch_size, seq_len, hidden_size], output grad
* @param mask [batch_size, seq_len, hidden_size], dropout mask
* @param hidden_size
* @return void
*/
template
<
ActivationType
act_type
,
typename
T
>
__global__
void
ls_dropout_act_bias_bwd_kernel
(
const
int
row_size
,
const
float
ratio
,
T
*
in_grad
,
T
*
__restrict__
bias_grad
,
const
T
*
__restrict__
input
,
const
T
*
__restrict__
bias
,
const
T
*
out_grad
,
const
uint8_t
*
__restrict__
mask
,
const
int
hidden_size
)
{
const
float
scale
=
1.
f
/
(
1.
f
-
ratio
);
__shared__
float
tile
[
WARP_SIZE
][
WARP_SIZE
+
1
];
cg
::
thread_block
b
=
cg
::
this_thread_block
();
cg
::
thread_block_tile
<
WARP_SIZE
>
g
=
cg
::
tiled_partition
<
WARP_SIZE
>
(
b
);
int
col_idx
=
flat_2dim
(
blockIdx
.
x
,
threadIdx
.
x
,
WARP_SIZE
);
int
stride
=
hidden_size
*
WARP_SIZE
;
float
local_sum
=
0
;
int
idx
=
flat_2dim
(
threadIdx
.
y
,
col_idx
,
hidden_size
);
if
(
col_idx
<
hidden_size
)
{
for
(
int
r
=
threadIdx
.
y
;
r
<
row_size
;
r
+=
WARP_SIZE
)
{
float
val
=
out_grad
[
idx
];
float
in
=
input
[
idx
];
float
b
=
bias
[
idx
%
hidden_size
];
val
=
activation_bwd_kernel
<
act_type
,
float
>
(
val
*
scale
*
static_cast
<
float
>
(
mask
[
idx
]),
in
+
b
);
local_sum
+=
val
;
in_grad
[
idx
]
=
val
;
idx
+=
stride
;
}
}
tile
[
threadIdx
.
x
][
threadIdx
.
y
]
=
local_sum
;
__syncthreads
();
float
sum
=
tile
[
threadIdx
.
y
][
threadIdx
.
x
];
__syncthreads
();
for
(
int
i
=
1
;
i
<
WARP_SIZE
;
i
<<=
1
)
sum
+=
g
.
shfl_down
(
sum
,
i
);
if
(
threadIdx
.
x
==
0
)
tile
[
0
][
threadIdx
.
y
]
=
sum
;
__syncthreads
();
if
(
threadIdx
.
y
==
0
)
{
int
pos
=
flat_2dim
(
blockIdx
.
x
,
threadIdx
.
x
,
WARP_SIZE
);
bias_grad
[
pos
]
=
tile
[
0
][
threadIdx
.
x
];
}
}
// @brief fused bias, activation, and dropout backward
// It is deprecated for precision reason. Keep it for future optimization.
//
// template <ActivationType act_type>
// __global__ void ls_dropout_act_bias_bwd_kernel(
// const int row_size, const float ratio, __half * in_grad,
// __half *__restrict__ bias_grad, const __half *__restrict__ input, const
// __half *__restrict__ bias, const __half * out_grad, const uint8_t
// *__restrict__ mask, const int hidden_size) {
// const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
// __shared__ __half2 tile[WARP_SIZE][WARP_SIZE + 1];
// cg::thread_block b = cg::this_thread_block();
// cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
// __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
// __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
// const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
// const __half2 *input2 = reinterpret_cast<const __half2 *>(input);
// const __half2 *bias2 = reinterpret_cast<const __half2 *>(bias);
// int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// int stride = hidden_size * WARP_SIZE;
// __half2 local_sum = __float2half2_rn(0.f);
// int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
// if (col_idx < hidden_size) {
// for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
// __half2 val = out_grad2[idx];
// __half2 in2 = input2[idx];
// __half2 b2 = bias2[idx % hidden_size ];
// __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
// val = activation_bwd_kernel<ActivationType::kRelu, __half2>(val * scale
// *
// m2,
// in2+b2);
// local_sum += val;
// in_grad2[idx] = val;
// idx += stride;
// }
// }
// tile[threadIdx.x][threadIdx.y] = local_sum;
// __syncthreads();
// __half2 sum = tile[threadIdx.y][threadIdx.x];
// __syncthreads();
// for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
// if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
// __syncthreads();
// if (threadIdx.y == 0) {
// int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// bias_grad2[pos] = tile[0][threadIdx.x];
// }
// }
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  ls_dropout_act_bias_bwd_kernel<act_type><<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, input, bias, out_grad, mask, dim);
}
// template <>
// void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
// __half *in_grad, __half *bias_grad,const __half *input, const __half
// *bias, const __half *out_grad, const uint8_t *mask, int row_size, int
// dim, float ratio, cudaStream_t stream) {
// dim >>= 1;
// dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
// dim3 block_dim(WARP_SIZE, WARP_SIZE);
// ls_dropout_act_bias_bwd_kernel<ActivationType::kRelu>
// <<<grid_dim, block_dim, 0, stream>>>(row_size, ratio, in_grad,
// bias_grad,
// input, bias,out_grad, mask, dim);
// }
template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);
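// Usage sketch (illustrative, not part of this file): how a caller might drive
// launch_ls_dropout_act_bias_bwd for the GeLU path. The device buffers, their
// shapes ([batch_size * seq_len, hidden_size]) and the function name are
// assumptions made for this example only.
void example_ffn_act_dropout_backward(float *d_in_grad, float *d_bias_grad,
                                      const float *d_input,
                                      const float *d_bias,
                                      const float *d_out_grad,
                                      const uint8_t *d_mask, int batch_tokens,
                                      int hidden_size, float dropout_ratio,
                                      cudaStream_t stream) {
  // The launcher derives a WARP_SIZE x WARP_SIZE block per WARP_SIZE columns.
  launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, float>(
      d_in_grad, d_bias_grad, d_input, d_bias, d_out_grad, d_mask,
      batch_tokens, hidden_size, dropout_ratio, stream);
}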
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu
0 → 100644
View file @
5c3843dc
#include "kernels.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.
@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE
@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
                                  T *__restrict__ out, int rows, int cols) {
  __shared__ float tile[WARP_SIZE][WARP_SIZE];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int y_stride = cols * WARP_SIZE;
  float localSum = 0;

  // Loop across matrix row
  // TODO: optimize to log complexity
  if (idx < cols) {
    int offset = flat_2dim(threadIdx.y, idx, cols);
    for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
      localSum += (float)inp[offset];
      offset += y_stride;
    }
  }

  // The sum of a row in tile is equal to the sum of a col in original matrix
  tile[threadIdx.x][threadIdx.y] = localSum;

  __syncthreads();

  // Sum the shared buffer.
  // The change of threadIdx.x is continuous
  float sum = tile[threadIdx.y][threadIdx.x];

  __syncthreads();

  // Calculate the sum of a row in tile
  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
    if (pos < cols) out[pos] = sum;
  }
}
// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
                                              int rows, int cols,
                                              cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);

  column_sum_reduce<float>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}

template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
                                               int rows, int cols,
                                               cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);

  column_sum_reduce<__half>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
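// Reference semantics (illustrative CPU sketch, not part of this file):
// launch_fuse_transpose_bias_kernel computes out[c] = sum over r of
// inp[r * cols + c], i.e. a column-wise sum as used for bias gradients.
void column_sum_reference(const float *inp, float *out, int rows, int cols) {
  for (int c = 0; c < cols; ++c) {
    float acc = 0.f;
    for (int r = 0; r < rows; ++r) acc += inp[r * cols + c];
    out[c] = acc;
  }
}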
/**
@brief: fused_add2
Add two matrix inp1 and inp2 to out.
@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)
@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
                                  int hidden_dim);

template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
                                         const float *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    val.x = vinp1.x + vinp2.x;
    val.y = vinp1.y + vinp2.y;
    val.z = vinp1.z + vinp2.z;
    val.w = vinp1.w + vinp2.w;
    out_4[offset + i] = val;
  }
}

template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
                                          const __half *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;
  __half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
  __half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
  __half2 *h2_val = reinterpret_cast<__half2 *>(&val);

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
    h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
    h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
    h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
    out_4[offset + i] = val;
  }
}
//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1, const float *inp2,
                              int batch_size, int seq_len, int hidden_dim,
                              cudaStream_t &stream) {
  hidden_dim >>= 2;

  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}

template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
                               const __half *inp2, int batch_size, int seq_len,
                               int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 3;

  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}
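// Usage sketch (illustrative only): a residual add with the float path above.
// d_out, d_a, d_b are assumed device buffers of batch_size * seq_len *
// hidden_dim elements; hidden_dim must be divisible by 4 (8 for __half)
// because the kernel loads float4 vectors.
void example_residual_add(float *d_out, const float *d_a, const float *d_b,
                          int batch_size, int seq_len, int hidden_dim,
                          cudaStream_t stream) {
  launch_fused_add2<float>(d_out, d_a, d_b, batch_size, seq_len, hidden_dim,
                           stream);
}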
template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
                                    int sz0, int sz2, int sz1_1, int sz1_2) {
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
  if (idx >= nele) {
    return;
  }
  float4 *dst_ptr = (float4 *)output + idx;
  int idx2 = idx % sz2;
  idx = idx / sz2;
  int idx1 = idx % (sz1_1 + sz1_2);
  int idx0 = idx / (sz1_1 + sz1_2);
  float4 *src_ptr = nullptr;
  int sz1 = 0;
  if (idx1 < sz1_1) {
    sz1 = sz1_1;
    src_ptr = (float4 *)inp1;
  } else {
    idx1 -= sz1_1;
    sz1 = sz1_2;
    src_ptr = (float4 *)inp2;
  }
  src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
  dst_ptr[0] = src_ptr[0];
}
template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
                                float *output, int sz0, int sz2, int sz1_1,
                                int sz1_2, cudaStream_t stream) {
  sz2 >>= 2;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}

template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
                                 __half *output, int sz0, int sz2, int sz1_1,
                                 int sz1_2, cudaStream_t stream) {
  sz2 >>= 3;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
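// Reference semantics (illustrative CPU sketch, not part of this file):
// launch_concat3_dim1 concatenates inp1 [sz0, sz1_1, sz2] and
// inp2 [sz0, sz1_2, sz2] along dim 1 into output [sz0, sz1_1 + sz1_2, sz2];
// sz2 is assumed divisible by 4 (8 for __half) for the vectorized copy.
void concat3_dim1_reference(const float *inp1, const float *inp2, float *out,
                            int sz0, int sz2, int sz1_1, int sz1_2) {
  for (int i0 = 0; i0 < sz0; ++i0) {
    for (int i1 = 0; i1 < sz1_1 + sz1_2; ++i1) {
      const float *src = (i1 < sz1_1)
                             ? inp1 + (i0 * sz1_1 + i1) * sz2
                             : inp2 + (i0 * sz1_2 + (i1 - sz1_1)) * sz2;
      float *dst = out + (i0 * (sz1_1 + sz1_2) + i1) * sz2;
      for (int i2 = 0; i2 < sz2; ++i2) dst[i2] = src[i2];
    }
  }
}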
colossalai/kernel/cuda_native/csrc/kernels/include/block_reduce.h
0 → 100644
View file @
5c3843dc
/* Copyright 2021 The LightSeq Team
Copyright Tencent/TurboTransformers
This block_reduce_n is adapted from Tencent/TurboTransformers
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
enum class ReduceType { kMax = 0, kSum };
const unsigned int WARP_REDUCE_MASK = 0xffffffff;
const float REDUCE_FLOAT_INF_NEG = -100000000.f;
const float REDUCE_FLOAT_INF_POS = 100000000.f;
const unsigned int WARP_REDUCE_SIZE = 32;
template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
  for (int mask = (WARP_REDUCE_SIZE >> 1); mask > 0; mask >>= 1)
    val += __shfl_xor_sync(WARP_REDUCE_MASK, val, mask, WARP_REDUCE_SIZE);
  return val;
}
/* Calculate the sum of all elements in a block */
template <typename T>
__forceinline__ __device__ T blockReduceSum(T val) {
  static __shared__ T shared[32];
  int lane = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  val = warpReduceSum<T>(val);

  if (lane == 0) shared[wid] = val;
  __syncthreads();

  val = (threadIdx.x < (blockDim.x >> 5)) ? shared[lane] : (T)0.0f;
  val = warpReduceSum<T>(val);
  return val;
}
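// Usage sketch (illustrative, not part of this header): a grid-stride kernel
// that uses blockReduceSum to produce one partial sum per block. The kernel
// name and buffers are assumptions; blockDim.x is assumed to be a multiple of
// WARP_REDUCE_SIZE so every warp taking part in the shuffle is full.
__global__ void example_block_sum(const float *in, float *partial, int n) {
  float v = 0.f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    v += in[i];
  }
  v = blockReduceSum<float>(v);
  if (threadIdx.x == 0) partial[blockIdx.x] = v;
}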
template <ReduceType Rtype, int Num>
__inline__ __device__ void blockReduce(float *pval);

// use template to make code more concise
template <ReduceType Rtype, int Num>
__inline__ __device__ void warpReduce(float *pval);
// static
template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 1>(float *pval) {
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32));
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 2>(float *pval) {
  float val0_tmp, val1_tmp;
#define WarpReduceMaxOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval), a, b);     \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  *(pval) = max(val0_tmp, *(pval));                                \
  *(pval + 1) = max(val1_tmp, *(pval + 1));

  WarpReduceMaxOneStep(16, 32);
  WarpReduceMaxOneStep(8, 32);
  WarpReduceMaxOneStep(4, 32);
  WarpReduceMaxOneStep(2, 32);
  WarpReduceMaxOneStep(1, 32);
#undef WarpReduceMaxOneStep
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 1>(float *pval) {
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32);
}
/*
 * Unroll the for loop of warpReduce to
 * improve instruction issue efficiency.
 * ElemX means there are X numbers to be summed
 */

template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 2>(float *pval) {
  float val0_tmp, val1_tmp;
#define WarpReduceSumOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  *(pval + 0) += val0_tmp;                                         \
  *(pval + 1) += val1_tmp

  WarpReduceSumOneStep(16, 32);
  WarpReduceSumOneStep(8, 32);
  WarpReduceSumOneStep(4, 32);
  WarpReduceSumOneStep(2, 32);
  WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 4>(float *pval) {
  float val0_tmp, val1_tmp, val2_tmp, val3_tmp;
#define WarpReduceSumOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  val2_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 2), a, b); \
  val3_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 3), a, b); \
  *(pval + 0) += val0_tmp;                                         \
  *(pval + 1) += val1_tmp;                                         \
  *(pval + 2) += val2_tmp;                                         \
  *(pval + 3) += val3_tmp

  WarpReduceSumOneStep(16, 32);
  WarpReduceSumOneStep(8, 32);
  WarpReduceSumOneStep(4, 32);
  WarpReduceSumOneStep(2, 32);
  WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 1>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 2>(float *pval) {
  const int num = 2;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 4>(float *pval) {
  const int num = 4;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 1>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 2>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 4>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}
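// Usage sketch (illustrative, not part of this header): reducing two running
// sums per thread (sum and sum of squares) with the multi-value blockReduce,
// e.g. as a building block for mean/variance. Kernel name and buffers are
// assumptions for this example only.
__global__ void example_mean_var_partial(const float *in, float *out, int n) {
  float acc[2] = {0.f, 0.f};
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    acc[0] += in[i];
    acc[1] += in[i] * in[i];
  }
  blockReduce<ReduceType::kSum, 2>(acc);
  if (threadIdx.x == 0) {
    out[2 * blockIdx.x] = acc[0];      // partial sum
    out[2 * blockIdx.x + 1] = acc[1];  // partial sum of squares
  }
}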
colossalai/kernel/cuda_native/csrc/kernels/include/context.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <iostream>
#include <string>
#include "cuda_util.h"
class Context {
 public:
  Context() : _stream(nullptr) {
    CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
  }

  virtual ~Context() {}

  static Context &Instance() {
    static Context _ctx;
    return _ctx;
  }

  void set_stream(cudaStream_t stream) {
    _stream = stream;
    CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
  }

  cudaStream_t get_stream() { return _stream; }

  cublasHandle_t get_cublashandle() { return _cublasHandle; }

 private:
  cudaStream_t _stream;
  cublasHandle_t _cublasHandle;
};
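// Usage sketch (illustrative, not part of this header): bind the current CUDA
// stream to the shared cuBLAS handle before launching GEMMs. The function name
// is an assumption for this example only.
inline void example_bind_stream(cudaStream_t stream) {
  Context::Instance().set_stream(stream);
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  (void)handle;  // pass to cublas_gemm_ex / FeedForward::Forward, etc.
}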
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <type_traits>
#include "cuda_util.h"
template <typename T>
class CrossEntropyLayer {
 public:
  CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);

  virtual ~CrossEntropyLayer();

  void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
               float *nll_loss_ptr);

  void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
                const int *targets_ptr, T *grad_inputs_ptr);

  void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);

 private:
  void allocate_mem_buffer() {
    // allocate local gpu memory
    _loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
  }

  void free_mem_buffer() {
    // free local gpu memory
    cuda_free(_loss_buffer);
  }

  const int _padding_idx;
  const float _epsilon;
  const int _max_batch_tokens;

  size_t _batch_size;
  size_t _seq_len;
  size_t _vocab_size;

  float *_loss_buffer;
};
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
0 → 100644
View file @
5c3843dc
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(
    cublasHandle_t handle, int m, int n, int k, const float *alpha,
    const float *beta, const __half *A, const __half *B, __half *C,
    cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
    int stride_C, int batch,
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line);

#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num);

void cuda_free(void *pdata);

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream);

#define CHECK_NAN_INF(ptr, size, stream)                                \
  check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream));     \
  check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <string>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "kernels.h"
template <typename T>
class Dropout {
 public:
  struct Config {
    float ratio;
    bool training;

    Config(float r) : ratio(r), training(true) {}
    float RATIO() const { return training ? ratio : 0.0; }
  };

  Dropout(const Config &config, size_t max_ele_num)
      : _config(config), _mask(nullptr) {
    _mask = cuda_malloc<uint8_t>(max_ele_num);
  }

  virtual ~Dropout() { cuda_free(_mask); }

  // after attention softmax
  void dropout(T *output, const T *input, int count, cudaStream_t stream,
               bool bwd = false) {
    launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
                         bwd);
  }

  void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
    launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
                         stream, true);
  }

  // transformer layer's postprocessing dropout, after attn or ffn module,
  // before residual add.
  void bias_dropout_residual(T *output, const T *input, const T *residual,
                             const T *bias, int rows, int cols,
                             cudaStream_t stream) {
    launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
                                  rows * cols, cols, _config.RATIO(), stream);
  }

  void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
                               int rows, int cols, cudaStream_t stream) {
    launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
                                  _config.RATIO(), stream);
  }

  // dropout inside ffn.
  void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
                        int cols, std::string activation_fn,
                        cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
                          const T *bias, int rows, int cols,
                          std::string activation_fn, cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  bool HasDropout() const { return _config.RATIO() > 0.0; }

  void SetTrainingMode(bool training) { _config.training = training; }

 private:
  uint8_t *_mask;
  Config _config;
};
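// Usage sketch (illustrative, not part of this header): fused bias + GeLU +
// dropout inside an FFN block. The buffer names and the 0.1 dropout ratio are
// assumptions; rows = batch_size * seq_len and cols = intermediate hidden size.
inline void example_ffn_bias_act_dropout(float *d_out, const float *d_in,
                                         const float *d_bias, int rows,
                                         int cols, cudaStream_t stream) {
  static Dropout<float> dropout(Dropout<float>::Config(0.1f),
                                static_cast<size_t>(rows) * cols);
  dropout.SetTrainingMode(true);
  dropout.bias_act_dropout(d_out, d_in, d_bias, rows, cols, "gelu", stream);
}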
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
0 → 100644
View file @
5c3843dc
#pragma once
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
#include "kernels.h"
template <typename T>
class FeedForward {
 public:
  struct Config {
    int outputSize;
    int inputSize;
    std::array<int, 3> gemm_algos;
    Config(int outputs, int inputs)
        : outputSize(outputs),
          inputSize(inputs),
          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
  };

  FeedForward(Config config) : config_(config) {}

  ~FeedForward() {}

  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
               cublasHandle_t &_cublasHandle) {
    float alpha = T(1.);
    float beta = T(0.);

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
  }

  void Backward(int bsz, const T *out_grad, const T *input_ptr,
                const T *weights, T *weights_grad, T *bias_grad,
                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
                bool compute_bias = true) {
    float alpha = (T)1.0, beta = (T)0.0;
    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
    if (compute_bias) {
      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
                                           config_.outputSize, stream);
    }
  }

  void reset_size(int outputSize, int inputSize) {
    config_.outputSize = outputSize;
    config_.inputSize = inputSize;
  }

 private:
  Config config_;
};
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#define MAX_THREADS 1024
#define WARP_SIZE 32
enum class ActivationType { kRelu, kGelu };

void launch_curand_init(int total_count, int dim, cudaStream_t stream);

template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
                       const T *scale, const T *bias, int batch_size,
                       int hidden_dim, cudaStream_t stream);

template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                  const T *residual_grad, const T *inp_or_out, const T *gamma,
                  const T *betta, const T *vars, const T *means, int batch,
                  int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size,
                         int heads, int from_len, int to_len, bool mask_future,
                         cudaStream_t stream);

template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
                            int softmax_len, cudaStream_t stream);

// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
                           int seq_length, int hidden_dim, int nhead,
                           cudaStream_t stream);

// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
                                     int dim_0, int dim_1, int dim_2,
                                     int dim_3, int dim_4,
                                     cudaStream_t stream);

// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
                             int seq_len, int hidden_dim, int nhead,
                             int trans_count, cudaStream_t stream);

template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
                       float ratio, cudaStream_t stream,
                       bool backward = false);

template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, const T *residual,
                                int total_count, int dim, float ratio,
                                cudaStream_t stream);

template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, int total_count, int dim,
                                float ratio, cudaStream_t stream);

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream);

template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows,
                                       int cols, cudaStream_t stream);

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream);

template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
                         int sz2, int sz1_1, int sz1_2, cudaStream_t stream);

template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
                       int seq_len, int hidden_size, cudaStream_t &stream);

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_lookup_scale_pos_dropout(
    T *output, const int *input, const T *embeddings, const T *pos_embeddings,
    uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
    int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);

template <typename T>
void launch_d_lookup_scale_pos_dropout(
    T *grad_embeddings, const T *grad_output, const int *input,
    const uint8_t *dropout_mask, int batch_size, int seq_len,
    int embedding_dim, int vocab_size, int padding_idx, float dropout_ratio,
    cudaStream_t &stream);

/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
  return id1 * dim2 + id2;
}

/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
                                                  int dim2, int dim3) {
  return id1 * dim2 * dim3 + id2 * dim3 + id3;
}

/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
                                                  int id4, int dim2, int dim3,
                                                  int dim4) {
  // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
  int res = id4;

  int ld = dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
                                                  int id4, int id5, int dim2,
                                                  int dim3, int dim4,
                                                  int dim5) {
  // return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5)
  // + id4*dim5 + id5;
  int res = id5;

  int ld = dim5;
  res += id4 * ld;

  ld *= dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
                                                  int id4, int id5, int id6,
                                                  int dim2, int dim3, int dim4,
                                                  int dim5, int dim6) {
  // return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
  // id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
  int res = id6;

  int ld = dim6;
  res += id5 * ld;

  ld *= dim5;
  res += id4 * ld;

  ld *= dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
    int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
    int *id1, int *id2, int *id3, int *id4, int *id5) {
  *id5 = src % dim5;
  src /= dim5;

  *id4 = src % dim4;
  src /= dim4;

  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int dim4, int *id0,
                                                        int *id1, int *id2,
                                                        int *id3, int *id4) {
  *id4 = src % dim4;
  src /= dim4;

  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int *id0, int *id1,
                                                        int *id2, int *id3) {
  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
                                                        int dim2, int *id0,
                                                        int *id1, int *id2) {
  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
                                                        int *id0, int *id1) {
  *id1 = src % dim1;
  *id0 = src / dim1;
}
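// Worked example (illustrative, not part of this header): for a [2, 3, 4]
// tensor, element (1, 2, 3) flattens to 1*3*4 + 2*4 + 3 = 23, and
// decompose_3dim inverts the mapping. The function name is an assumption.
inline void example_index_round_trip() {
  int flat = flat_3dim(1, 2, 3, /*dim2=*/3, /*dim3=*/4);  // 23
  int i0, i1, i2;
  decompose_3dim(flat, /*dim1=*/3, /*dim2=*/4, &i0, &i1, &i2);
  printf("%d -> (%d, %d, %d)\n", flat, i0, i1, i2);  // 23 -> (1, 2, 3)
}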
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh
0 → 100644
View file @
5c3843dc
// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;

template <typename T>
class Normalize_Layer {
 public:
  struct Config {
    uint32_t hidden_dim;
    bool use_mean;
    Config(uint32_t hidden_dim, bool use_mean = false)
        : hidden_dim(hidden_dim), use_mean(use_mean) {}
  };

  Normalize_Layer(Config config, size_t max_rows)
      : config_(config), vars_(nullptr), means_(nullptr) {
    vars_ = cuda_malloc<T>(max_rows);
    if (config_.use_mean) {
      means_ = cuda_malloc<T>(max_rows);
    }
  }

  ~Normalize_Layer() {
    cuda_free(vars_);
    cuda_free(means_);
  }

  void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
               int batch_size, cudaStream_t stream) {
    launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
                      config_.hidden_dim, stream);
  }

  /*
  residual_grad, inp_or_out, betta should be treated carefully.
  inp_or_out = input if use_mean else output
  residual_grad, betta can be nullptr.
  residual_grad will be added to dinp if it is not nullptr,
    which is useful in transformer layer when pre-ln
  betta are only used to compute xhat,
    (use_mean == false) ^ (betta == nullptr) should be true
  */
  void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                const T *residual_grad, const T *inp_or_out, const T *gamma,
                const T *betta, int batch_size, cudaStream_t stream[2]) {
    launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
                 inp_or_out, gamma, betta, vars_, means_, batch_size,
                 config_.hidden_dim, stream);
  }

  inline bool use_mean() const { return config_.use_mean; }

 private:
  Config config_;
  T *vars_;
  T *means_;
};
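// Usage sketch (illustrative, not part of this header): pre-LN layer norm
// forward with per-row means kept for the backward pass, as described in the
// Backward comment above. Buffer names and sizes are assumptions.
inline void example_layer_norm_forward(float *d_out, const float *d_in,
                                       const float *d_gamma,
                                       const float *d_beta, int batch_tokens,
                                       int hidden_dim, cudaStream_t stream) {
  static Normalize_Layer<float> ln(
      Normalize_Layer<float>::Config(hidden_dim, /*use_mean=*/true),
      /*max_rows=*/static_cast<size_t>(batch_tokens));
  ln.Forward(d_out, d_in, d_gamma, d_beta, batch_tokens, stream);
}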
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h
0 → 100644
View file @
5c3843dc
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;

template <typename T>
class Softmax {
 public:
  struct Config {
    size_t nhead;
    Config(size_t nhead) : nhead(nhead) {}
  };

  Softmax(Config config) : config_(config) {}

  ~Softmax() {}

  void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
               int to_len, cudaStream_t &stream, bool mask_future = true) {
    launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead,
                           from_len, to_len, mask_future, stream);
  }

  void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
                int to_len, cudaStream_t stream) {
    launch_attn_softmax_bw<T>(out_grad, soft_out,
                              batch_size * config_.nhead * from_len, to_len,
                              stream);
  }

  void reset_size(size_t nhead) { config_.nhead = nhead; }

 private:
  Config config_;
};