OpenDAS / ColossalAI · Commits · 5c3843dc

Unverified commit 5c3843dc, authored Dec 21, 2021 by shenggan, committed by GitHub on Dec 21, 2021.

add colossalai kernel module (#55)

Parent: 648f8063 · Changes: 43
Showing 20 changed files with 2858 additions and 0 deletions (+2858, -0).
colossalai/kernel/__init__.py                                             +8     -0
colossalai/kernel/cuda_native/__init__.py                                 +17    -0
colossalai/kernel/cuda_native/builder.py                                  +114   -0
colossalai/kernel/cuda_native/csrc/compat.h                               +13    -0
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu               +191   -0
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu             +87    -0
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu                   +169   -0
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu             +1001  -0
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu             +232   -0
colossalai/kernel/cuda_native/csrc/kernels/include/block_reduce.h         +312   -0
colossalai/kernel/cuda_native/csrc/kernels/include/context.h              +36    -0
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h  +46    -0
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h      +40    -0
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h            +34    -0
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h              +95    -0
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h         +68    -0
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h              +274   -0
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh             +12    -0
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h      +65    -0
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h              +44    -0
colossalai/kernel/__init__.py (new file, mode 100644)

from .jit.bias_dropout_add import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
from .jit.bias_gelu import bias_gelu_impl
from .cuda_native import LayerNorm, FusedScaleMaskSoftmax, MultiHeadAttention

__all__ = [
    "bias_dropout_add_fused_train", "bias_dropout_add_fused_inference", "bias_gelu_impl",
    "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"
]
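The package-level __init__ simply re-exports the fused ops. As a minimal usage sketch (illustration only, not part of the commit; it assumes the CUDA extensions build successfully at import time and that MixedFusedLayerNorm keeps the torch.nn.LayerNorm-style constructor of the apex module it is derived from):

import torch
from colossalai.kernel import LayerNorm

# hypothetical shapes; LayerNorm here resolves to the CUDA MixedFusedLayerNorm
ln = LayerNorm(1024).cuda().half()    # normalize over a last dimension of size 1024
x = torch.randn(8, 512, 1024, device='cuda', dtype=torch.half)
y = ln(x)                             # same call signature as torch.nn.LayerNorm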
colossalai/kernel/cuda_native/__init__.py (new file, mode 100644)

from .builder import _build_cuda_native_kernel

CUDA_NATIVE_KERNEL_BUILD = False


def build_cuda_native_kernel():
    global CUDA_NATIVE_KERNEL_BUILD
    if CUDA_NATIVE_KERNEL_BUILD == False:
        _build_cuda_native_kernel()
        CUDA_NATIVE_KERNEL_BUILD = True


build_cuda_native_kernel()

from .layer_norm import MixedFusedLayerNorm as LayerNorm
from .scaled_softmax import FusedScaleMaskSoftmax
from .multihead_attention import MultiHeadAttention
colossalai/kernel/cuda_native/builder.py (new file, mode 100644)

import os
import pathlib
import subprocess

from torch.utils import cpp_extension

# Setting this param to a list has a problem of generating different
# compilation commands (with different order of architectures) and
# leading to recompilation of fused kernels. Set it to empty string
# to avoid recompilation and assign arch flags explicitly in
# extra_cuda_cflags below.
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def _build_cuda_native_kernel():

    # Check if cuda 11 is installed for compute capability 8.0
    cc_flag = []
    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    if int(bare_metal_major) >= 11:
        cc_flag.append('-gencode')
        cc_flag.append('arch=compute_80,code=sm_80')

    # Build path
    basepath = pathlib.Path(__file__).parent.absolute()
    srcpath = basepath / 'csrc'
    buildpath = basepath / 'build'
    _create_build_dir(buildpath)

    # Helper function to build the kernels.
    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_cflags=['-O3', ],
            extra_include_paths=[str(srcpath / 'kernels' / 'include')],
            extra_cuda_cflags=['-O3', '-gencode', 'arch=compute_70,code=sm_70', '--use_fast_math'] +
                              extra_cuda_flags + cc_flag,
            verbose=False)

    # ==============
    # Fused softmax.
    # ==============

    extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                        '-U__CUDA_NO_HALF_CONVERSIONS__',
                        '--expt-relaxed-constexpr',
                        '--expt-extended-lambda']

    # Upper triangular softmax.
    sources = [srcpath / 'scaled_upper_triang_masked_softmax.cpp',
               srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
    colossal_scaled_upper_triang_masked_softmax = _cpp_extention_load_helper(
        "colossal_scaled_upper_triang_masked_softmax", sources, extra_cuda_flags)

    # Masked softmax.
    sources = [srcpath / 'scaled_masked_softmax.cpp',
               srcpath / 'scaled_masked_softmax_cuda.cu']
    colossal_scaled_masked_softmax = _cpp_extention_load_helper(
        "colossal_scaled_masked_softmax", sources, extra_cuda_flags)

    # =================================
    # Mixed precision fused layer norm.
    # =================================

    extra_cuda_flags = ['-maxrregcount=50']
    sources = [srcpath / 'layer_norm_cuda.cpp',
               srcpath / 'layer_norm_cuda_kernel.cu']
    colossal_layer_norm_cuda = _cpp_extention_load_helper(
        "colossal_layer_norm_cuda", sources, extra_cuda_flags)

    # ==========================================
    # Mixed precision Transformer Encoder Layer.
    # ==========================================

    extra_cuda_flags = ['-std=c++14',
                        '-U__CUDA_NO_HALF_OPERATORS__',
                        '-U__CUDA_NO_HALF_CONVERSIONS__',
                        '-U__CUDA_NO_HALF2_OPERATORS__',
                        '-DTHRUST_IGNORE_CUB_VERSION_CHECK']

    sources = [srcpath / 'multihead_attention_1d.cpp']
    kernel_sources = ["cublas_wrappers.cu",
                      "transform_kernels.cu",
                      "dropout_kernels.cu",
                      "normalize_kernels.cu",
                      "softmax_kernels.cu",
                      "general_kernels.cu",
                      "cuda_util.cu"]
    sources += [(srcpath / 'kernels' / cu_file) for cu_file in kernel_sources]
    colossal_multihead_attention = _cpp_extention_load_helper(
        "colossal_multihead_attention", sources, extra_cuda_flags)


def _get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]

    return raw_output, bare_metal_major, bare_metal_minor


def _create_build_dir(buildpath):
    try:
        os.mkdir(buildpath)
    except OSError:
        if not os.path.isdir(buildpath):
            print(f"Creation of the build directory {buildpath} failed")
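For reference, the whole builder reduces to repeated calls of torch.utils.cpp_extension.load, which compiles the listed sources with nvcc/gcc on first use and caches the resulting module in the build directory. A stripped-down sketch of that pattern (hypothetical extension name and source files, not the exact flag set used above):

import pathlib
from torch.utils import cpp_extension

srcpath = pathlib.Path(__file__).parent.absolute() / 'csrc'

# JIT-compile and import a CUDA extension; later calls reuse the cached build.
module = cpp_extension.load(
    name="my_fused_kernel",                      # hypothetical name
    sources=[srcpath / 'my_kernel.cpp',          # hypothetical sources
             srcpath / 'my_kernel_cuda.cu'],
    extra_cflags=['-O3'],
    extra_cuda_cflags=['-O3', '--use_fast_math'],
    verbose=False)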
colossalai/kernel/cuda_native/csrc/compat.h (new file, mode 100644)

/* This code is from NVIDIA apex:
 *     https://github.com/NVIDIA/apex
 *     with minor changes. */

#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu (new file, mode 100644)

#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"

ls::cub::CachingDeviceAllocator g_allocator(true);

template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
    const T *__restrict__ inputs, const int *__restrict__ targets,
    float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[2] = {0.f, 0.f};  // logit and logit exp
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    if (threadIdx.x == 0) {
      nll_loss_outputs[blockIdx.x] = 0.f;
      outputs[blockIdx.x] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += logit;
    sum_logits[1] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 2>(sum_logits);
  __shared__ float s_sum_logit;
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_logit = sum_logits[0];
    s_sum_exp = sum_logits[1];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  if (threadIdx.x == 0) {
    // neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
    float nll_loss = logf(s_sum_exp) -
                     static_cast<float>(inputs[block_start + target_tid]) +
                     s_max_input;
    nll_loss_outputs[blockIdx.x] = nll_loss;
    float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
    outputs[blockIdx.x] =
        (1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
  }
}

template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
    const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
    const int *__restrict__ targets, T *__restrict__ grad_inputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[1] = {0.f};
  const float grad_out = static_cast<float>(grad_outputs[0]);
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    for (int i = left_idx; i < right_idx; i += blockDim.x) {
      grad_inputs[i] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 1>(sum_logits);
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_exp = sum_logits[0];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  float nll_weight = 1.0 - epsilon - eps_i;

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
    float grad = 0;
    grad += (vocab_size * prob - 1) * eps_i;
    grad += prob * nll_weight;
    if ((i - block_start) == target_tid) {
      grad -= nll_weight;
    }
    grad_inputs[i] = grad_out * grad;
  }
}

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  float *nll_loss_buffer = loss_buffer + grid_dim;
  ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
      epsilon, vocab_size);

  int num_items = grid_dim;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(
      g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, nll_loss_buffer, nll_loss_ptr,
      num_items, stream));
  CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}

template void launch_cross_entropy_fw<float>(
    const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_fw<__half>(
    const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
      epsilon, vocab_size);
}

template void launch_cross_entropy_bw<float>(
    const float *grad_outputs_ptr, const float *inputs_ptr,
    const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_bw<__half>(
    const float *grad_outputs_ptr, const __half *inputs_ptr,
    const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);
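Spelling out what the forward kernel computes for one token position (read directly from the code above, with $V$ the vocabulary size, $\varepsilon$ the smoothing factor, $x_j$ the logits and $t$ the target index):

$$
\varepsilon_i = \frac{\varepsilon}{V-1},\qquad
\mathrm{nll} = \log\!\sum_j e^{x_j - x_{\max}} - (x_t - x_{\max}),\qquad
\mathrm{loss} = (1-\varepsilon-\varepsilon_i)\,\mathrm{nll}
  + \varepsilon_i\Bigl(V\log\!\sum_j e^{x_j - x_{\max}} - \sum_j (x_j - x_{\max})\Bigr).
$$

Each block writes the label-smoothed loss to outputs and the plain negative log-likelihood to nll_loss_outputs; the launcher then sums both buffers over all positions with cub DeviceReduce::Sum.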
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu (new file, mode 100644)

/* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
*/
#include "cublas_wrappers.h"

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmEx(
      handle, transa, transb, m, n, k, (const void *)alpha, (const void *)A,
      CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k, (const void *)B, CUDA_R_16F,
      (transb == CUBLAS_OP_N) ? k : n, (const void *)beta, (void *)C,
      CUDA_R_16F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
            "error: %d) \n",
            batch, m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const __half *A, const __half *B, __half *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu (new file, mode 100644)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#include "cuda_util.h"

/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
  return cudaGetErrorString(error);
}

std::string _cudaGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }
  return "CUBLAS_UNKNOW";
}

template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line) {
  if (result) {
    throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" +
                             std::to_string(line) +
                             "): " + (_cudaGetErrorString(result)) + " \n");
  }
}

template void check_gpu_error<cudaError_t>(cudaError_t result,
                                           char const *const func,
                                           const char *const file,
                                           int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,
                                              char const *const func,
                                              const char *const file,
                                              int const line);

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<T> hout(num_output_ele, (T)0);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << hout[i] << ", ";
  }
  std::cout << std::endl;
}

template <>
void print_vec<__half>(const __half *outv, std::string outn,
                       int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<__half> hout(num_output_ele, (__half)0.f);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << __half2float(hout[i]) << ", ";
  }
  std::cout << std::endl;
}

template void print_vec<float>(const float *outv, std::string outn,
                               int num_output_ele);

template void print_vec<int>(const int *outv, std::string outn,
                             int num_output_ele);

template void print_vec<__half>(const __half *outv, std::string outn,
                                int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num) {
  size_t byte_size = ele_num * sizeof(T);
  T *pdata = nullptr;
  CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
  return pdata;
}

template float *cuda_malloc<float>(size_t ele_num);
template __half *cuda_malloc<__half>(size_t ele_num);
template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);

void cuda_free(void *pdata) {
  if (pdata != nullptr) {
    cudaFree(pdata);
  }
}

template <typename T>
struct _isnan {
  __device__ bool operator()(T a) const { return isnan(a); }
};

template <>
struct _isnan<__half> {
  __device__ bool operator()(const __half a) const { return __hisnan(a); }
};

template <typename T>
struct _isinf {
  __device__ bool operator()(T a) const { return isinf(a); }
};

template <>
struct _isinf<__half> {
  __device__ bool operator()(const __half a) const { return __hisinf(a); }
};

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream) {
  // check_nan_inf = 0 for checking nan
  // check_nan_inf = 1 for checking inf
  bool res = false;
  std::string msg = file + "(" + std::to_string(line) + "): ";
  if (check_nan_inf) {
    msg += "nan.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isnan<T>(), false,
                                   thrust::logical_or<bool>());
  } else {
    msg += "inf.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isinf<T>(), false,
                                   thrust::logical_or<bool>());
  }
  if (res) {
    throw std::runtime_error(msg);
  }
  std::cout << msg << " [check pass]." << std::endl;
}

template void check_nan_inf<float>(const float *data_ptr, int dsize,
                                   bool check_nan_inf, std::string file,
                                   int line, cudaStream_t stream);

template void check_nan_inf<__half>(const __half *data_ptr, int dsize,
                                    bool check_nan_inf, std::string file,
                                    int line, cudaStream_t stream);
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu (new file, mode 100644)

(This diff is collapsed in the viewer; the file's 1001 added lines are not shown.)
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu (new file, mode 100644)

#include "kernels.h"

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.

@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE

@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
                                  T *__restrict__ out, int rows, int cols) {
  __shared__ float tile[WARP_SIZE][WARP_SIZE];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int y_stride = cols * WARP_SIZE;
  float localSum = 0;

  // Loop across matrix row
  // TODO: optimize to log complexity
  if (idx < cols) {
    int offset = flat_2dim(threadIdx.y, idx, cols);
    for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
      localSum += (float)inp[offset];
      offset += y_stride;
    }
  }

  // The sum of a row in tile is equal to the sum of a col in original matrix
  tile[threadIdx.x][threadIdx.y] = localSum;

  __syncthreads();

  // Sum the shared buffer.
  // The change of threadIdx.x is continuous
  float sum = tile[threadIdx.y][threadIdx.x];

  __syncthreads();

  // Calculate the sum of a row in tile
  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
    if (pos < cols) out[pos] = sum;
  }
}

// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
                                              int rows, int cols,
                                              cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);

  column_sum_reduce<float>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}

template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
                                               int rows, int cols,
                                               cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);

  column_sum_reduce<__half>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}

/**
@brief: fused_add2
Add two matrix inp1 and inp2 to out.

@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)

@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
                                  int hidden_dim);

template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
                                         const float *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    val.x = vinp1.x + vinp2.x;
    val.y = vinp1.y + vinp2.y;
    val.z = vinp1.z + vinp2.z;
    val.w = vinp1.w + vinp2.w;
    out_4[offset + i] = val;
  }
}

template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
                                          const __half *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;
  __half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
  __half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
  __half2 *h2_val = reinterpret_cast<__half2 *>(&val);

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
    h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
    h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
    h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
    out_4[offset + i] = val;
  }
}

//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1, const float *inp2,
                              int batch_size, int seq_len, int hidden_dim,
                              cudaStream_t &stream) {
  hidden_dim >>= 2;

  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}

template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
                               const __half *inp2, int batch_size, int seq_len,
                               int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 3;

  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}

template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
                                    int sz0, int sz2, int sz1_1, int sz1_2) {
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
  if (idx >= nele) {
    return;
  }
  float4 *dst_ptr = (float4 *)output + idx;
  int idx2 = idx % sz2;
  idx = idx / sz2;
  int idx1 = idx % (sz1_1 + sz1_2);
  int idx0 = idx / (sz1_1 + sz1_2);
  float4 *src_ptr = nullptr;
  int sz1 = 0;
  if (idx1 < sz1_1) {
    sz1 = sz1_1;
    src_ptr = (float4 *)inp1;
  } else {
    idx1 -= sz1_1;
    sz1 = sz1_2;
    src_ptr = (float4 *)inp2;
  }
  src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
  dst_ptr[0] = src_ptr[0];
}

template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
                                float *output, int sz0, int sz2, int sz1_1,
                                int sz1_2, cudaStream_t stream) {
  sz2 >>= 2;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}

template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
                                 __half *output, int sz0, int sz2, int sz1_1,
                                 int sz1_2, cudaStream_t stream) {
  sz2 >>= 3;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
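The @brief comments above state what each kernel computes; for quick reference, the same operations written out in NumPy (a sketch with made-up shapes, not tied to the CUDA launch configuration):

import numpy as np

rows, cols = 128, 1024
inp = np.random.rand(rows, cols).astype(np.float32)
col_sum = inp.sum(axis=0)             # column_sum_reduce: out[c] = sum_r inp[r, c]

a = np.random.rand(4, 16, 64).astype(np.float32)
b = np.random.rand(4, 16, 64).astype(np.float32)
added = a + b                         # fused_add2: elementwise sum of two [b, s, h] tensors

x = np.random.rand(4, 10, 64).astype(np.float32)
y = np.random.rand(4, 6, 64).astype(np.float32)
cat = np.concatenate([x, y], axis=1)  # kernel_concat3_dim1: concatenate along dim 1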
colossalai/kernel/cuda_native/csrc/kernels/include/block_reduce.h (new file, mode 100644)

/* Copyright 2021 The LightSeq Team
   Copyright Tencent/TurboTransformers
   This block_reduce_n is adapted from Tencent/TurboTransformers
*/
#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

enum class ReduceType { kMax = 0, kSum };
const unsigned int WARP_REDUCE_MASK = 0xffffffff;
const float REDUCE_FLOAT_INF_NEG = -100000000.f;
const float REDUCE_FLOAT_INF_POS = 100000000.f;
const unsigned int WARP_REDUCE_SIZE = 32;

template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
  for (int mask = (WARP_REDUCE_SIZE >> 1); mask > 0; mask >>= 1)
    val += __shfl_xor_sync(WARP_REDUCE_MASK, val, mask, WARP_REDUCE_SIZE);
  return val;
}

/* Calculate the sum of all elements in a block */
template <typename T>
__forceinline__ __device__ T blockReduceSum(T val) {
  static __shared__ T shared[32];
  int lane = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  val = warpReduceSum<T>(val);

  if (lane == 0) shared[wid] = val;
  __syncthreads();

  val = (threadIdx.x < (blockDim.x >> 5)) ? shared[lane] : (T)0.0f;
  val = warpReduceSum<T>(val);
  return val;
}

template <ReduceType Rtype, int Num>
__inline__ __device__ void blockReduce(float *pval);

// use template to make code more concise
template <ReduceType Rtype, int Num>
__inline__ __device__ void warpReduce(float *pval);

// static
template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 1>(float *pval) {
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32));
  *pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32));
}

template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 2>(float *pval) {
  float val0_tmp, val1_tmp;
#define WarpReduceMaxOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval), a, b);     \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  *(pval) = max(val0_tmp, *(pval));                                \
  *(pval + 1) = max(val1_tmp, *(pval + 1));

  WarpReduceMaxOneStep(16, 32);
  WarpReduceMaxOneStep(8, 32);
  WarpReduceMaxOneStep(4, 32);
  WarpReduceMaxOneStep(2, 32);
  WarpReduceMaxOneStep(1, 32);
#undef WarpReduceMaxOneStep
}

template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 1>(float *pval) {
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32);
  *pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32);
}

/*
 * Unroll the for loop of the warp reduce to
 * improve instruction issue efficiency.
 * ElemX means there are X numbers to be summed
 */

template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 2>(float *pval) {
  float val0_tmp, val1_tmp;
#define WarpReduceSumOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  *(pval + 0) += val0_tmp;                                         \
  *(pval + 1) += val1_tmp

  WarpReduceSumOneStep(16, 32);
  WarpReduceSumOneStep(8, 32);
  WarpReduceSumOneStep(4, 32);
  WarpReduceSumOneStep(2, 32);
  WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}

template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 4>(float *pval) {
  float val0_tmp, val1_tmp, val2_tmp, val3_tmp;
#define WarpReduceSumOneStep(a, b)                                 \
  val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
  val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
  val2_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 2), a, b); \
  val3_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 3), a, b); \
  *(pval + 0) += val0_tmp;                                         \
  *(pval + 1) += val1_tmp;                                         \
  *(pval + 2) += val2_tmp;                                         \
  *(pval + 3) += val3_tmp

  WarpReduceSumOneStep(16, 32);
  WarpReduceSumOneStep(8, 32);
  WarpReduceSumOneStep(4, 32);
  WarpReduceSumOneStep(2, 32);
  WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 1>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 2>(float *pval) {
  const int num = 2;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 4>(float *pval) {
  const int num = 4;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kSum, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = 0.f;
    }
  }
  warpReduce<ReduceType::kSum, num>(pval);
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 1>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 2>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}

template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 4>(float *pval) {
  const int num = 1;
  static __shared__ float shared[num][32];
  int lane_id = threadIdx.x & 0x1f;
  int wid = threadIdx.x >> 5;

  warpReduce<ReduceType::kMax, num>(pval);

  if (lane_id == 0) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      shared[i][wid] = *(pval + i);
    }
  }
  __syncthreads();

  if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = shared[i][lane_id];
    }
  } else {
#pragma unroll
    for (int i = 0; i < num; ++i) {
      *(pval + i) = REDUCE_FLOAT_INF_NEG;
    }
  }
  warpReduce<ReduceType::kMax, num>(pval);
}
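The warpReduce/blockReduce specializations above all follow the same XOR-butterfly shuffle pattern: each of the 32 lanes repeatedly exchanges its partial value with the lane whose index differs in one bit, halving the exchange distance each step, so after five steps every lane holds the full reduction. A small NumPy emulation of that pattern (illustration only; on the GPU the exchange is done by __shfl_xor_sync):

import numpy as np

def butterfly_sum(vals):
    vals = np.array(vals, dtype=np.float32)   # one value per "lane"
    assert vals.size == 32
    mask = 16
    while mask > 0:
        partner = np.arange(32) ^ mask        # lane whose index differs in one bit
        vals = vals + vals[partner]           # every lane adds its partner's value
        mask >>= 1
    return vals                               # all 32 entries now equal the total

print(butterfly_sum(range(32))[:4])           # -> [496. 496. 496. 496.]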
colossalai/kernel/cuda_native/csrc/kernels/include/context.h (new file, mode 100644)

#pragma once

#include <cublas_v2.h>
#include <cuda.h>

#include <iostream>
#include <string>

#include "cuda_util.h"

class Context {
 public:
  Context() : _stream(nullptr) {
    CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
  }

  virtual ~Context() {}

  static Context &Instance() {
    static Context _ctx;
    return _ctx;
  }

  void set_stream(cudaStream_t stream) {
    _stream = stream;
    CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
  }

  cudaStream_t get_stream() { return _stream; }

  cublasHandle_t get_cublashandle() { return _cublasHandle; }

 private:
  cudaStream_t _stream;
  cublasHandle_t _cublasHandle;
};
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h (new file, mode 100644)

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>

#include <type_traits>

#include "cuda_util.h"

template <typename T>
class CrossEntropyLayer {
 public:
  CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);

  virtual ~CrossEntropyLayer();

  void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
               float *nll_loss_ptr);

  void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
                const int *targets_ptr, T *grad_inputs_ptr);

  void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);

 private:
  void allocate_mem_buffer() {
    // allocate local gpu memory
    _loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
  }

  void free_mem_buffer() {
    // free local gpu memory
    cuda_free(_loss_buffer);
  }

  const int _padding_idx;
  const float _epsilon;
  const int _max_batch_tokens;

  size_t _batch_size;
  size_t _seq_len;
  size_t _vocab_size;

  float *_loss_buffer;
};
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h (new file, mode 100644)

/* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
*/
#pragma once

#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(
    cublasHandle_t handle, int m, int n, int k, const float *alpha,
    const float *beta, const __half *A, const __half *B, __half *C,
    cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
    int stride_C, int batch,
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h (new file, mode 100644)

#pragma once

#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>

template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line);

#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num);

void cuda_free(void *pdata);

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream);

#define CHECK_NAN_INF(ptr, size, stream)                            \
  check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \
  check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h (new file, mode 100644)

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

#include <string>

#include "kernels.h"

template <typename T>
class Dropout {
 public:
  struct Config {
    float ratio;
    bool training;

    Config(float r) : ratio(r), training(true) {}
    float RATIO() const { return training ? ratio : 0.0; }
  };

  Dropout(const Config &config, size_t max_ele_num)
      : _config(config), _mask(nullptr) {
    _mask = cuda_malloc<uint8_t>(max_ele_num);
  }

  virtual ~Dropout() { cuda_free(_mask); }

  // after attention softmax
  void dropout(T *output, const T *input, int count, cudaStream_t stream,
               bool bwd = false) {
    launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
                         bwd);
  }

  void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
    launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
                         stream, true);
  }

  // transformer layer's postprocessing dropout, after attn or ffn module,
  // before residual add.
  void bias_dropout_residual(T *output, const T *input, const T *residual,
                             const T *bias, int rows, int cols,
                             cudaStream_t stream) {
    launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
                                  rows * cols, cols, _config.RATIO(), stream);
  }

  void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
                               int rows, int cols, cudaStream_t stream) {
    launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
                                  _config.RATIO(), stream);
  }

  // dropout inside ffn.
  void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
                        int cols, std::string activation_fn,
                        cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
                          const T *bias, int rows, int cols,
                          std::string activation_fn, cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  bool HasDropout() const { return _config.RATIO() > 0.0; }

  void SetTrainingMode(bool training) { _config.training = training; }

 private:
  uint8_t *_mask;
  Config _config;
};
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h (new file, mode 100644)

#pragma once

/* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

#include <array>

#include "cublas_wrappers.h"
#include "kernels.h"

template <typename T>
class FeedForward {
 public:
  struct Config {
    int outputSize;
    int inputSize;
    std::array<int, 3> gemm_algos;
    Config(int outputs, int inputs)
        : outputSize(outputs),
          inputSize(inputs),
          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
  };

  FeedForward(Config config) : config_(config) {}

  ~FeedForward() {}

  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
               cublasHandle_t &_cublasHandle) {
    float alpha = T(1.);
    float beta = T(0.);

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
  }

  void Backward(int bsz, const T *out_grad, const T *input_ptr,
                const T *weights, T *weights_grad, T *bias_grad,
                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
                bool compute_bias = true) {
    float alpha = (T)1.0, beta = (T)0.0;
    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
    if (compute_bias) {
      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
                                           config_.outputSize, stream);
    }
  }

  void reset_size(int outputSize, int inputSize) {
    config_.outputSize = outputSize;
    config_.inputSize = inputSize;
  }

 private:
  Config config_;
};
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h (new file, mode 100644)

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>

#include <stdexcept>

#define MAX_THREADS 1024
#define WARP_SIZE 32

enum class ActivationType { kRelu, kGelu };

void launch_curand_init(int total_count, int dim, cudaStream_t stream);

template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
                       const T *scale, const T *bias, int batch_size,
                       int hidden_dim, cudaStream_t stream);

template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                  const T *residual_grad, const T *inp_or_out, const T *gamma,
                  const T *betta, const T *vars, const T *means, int batch,
                  int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size,
                         int heads, int from_len, int to_len, bool mask_future,
                         cudaStream_t stream);

template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
                            int softmax_len, cudaStream_t stream);

// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
                           int seq_length, int hidden_dim, int nhead,
                           cudaStream_t stream);

// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
                                     int dim_0, int dim_1, int dim_2,
                                     int dim_3, int dim_4,
                                     cudaStream_t stream);

// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
                             int seq_len, int hidden_dim, int nhead,
                             int trans_count, cudaStream_t stream);

template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
                       float ratio, cudaStream_t stream,
                       bool backward = false);

template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, const T *residual,
                                int total_count, int dim, float ratio,
                                cudaStream_t stream);

template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, int total_count, int dim,
                                float ratio, cudaStream_t stream);

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream);

template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows,
                                       int cols, cudaStream_t stream);

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream);

template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
                         int sz2, int sz1_1, int sz1_2, cudaStream_t stream);

template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
                       int seq_len, int hidden_size, cudaStream_t &stream);

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_lookup_scale_pos_dropout(
    T *output, const int *input, const T *embeddings, const T *pos_embeddings,
    uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
    int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);

template <typename T>
void launch_d_lookup_scale_pos_dropout(
    T *grad_embeddings, const T *grad_output, const int *input,
    const uint8_t *dropout_mask, int batch_size, int seq_len,
    int embedding_dim, int vocab_size, int padding_idx, float dropout_ratio,
    cudaStream_t &stream);

/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
  return id1 * dim2 + id2;
}

/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
                                                  int dim2, int dim3) {
  return id1 * dim2 * dim3 + id2 * dim3 + id3;
}

/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
                                                  int id4, int dim2, int dim3,
                                                  int dim4) {
  // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
  int res = id4;

  int ld = dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
                                                  int id4, int id5, int dim2,
                                                  int dim3, int dim4,
                                                  int dim5) {
  // return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5) +
  // id4*dim5 + dim5;
  int res = id5;

  int ld = dim5;
  res += id4 * ld;

  ld *= dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
                                                  int id4, int id5, int id6,
                                                  int dim2, int dim3, int dim4,
                                                  int dim5, int dim6) {
  // return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
  // id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
  int res = id6;

  int ld = dim6;
  res += id5 * ld;

  ld *= dim5;
  res += id4 * ld;

  ld *= dim4;
  res += id3 * ld;

  ld *= dim3;
  res += id2 * ld;

  ld *= dim2;
  res += id1 * ld;

  return res;
}

/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
    int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
    int *id1, int *id2, int *id3, int *id4, int *id5) {
  *id5 = src % dim5;
  src /= dim5;

  *id4 = src % dim4;
  src /= dim4;

  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int dim4, int *id0,
                                                        int *id1, int *id2,
                                                        int *id3, int *id4) {
  *id4 = src % dim4;
  src /= dim4;

  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int *id0, int *id1,
                                                        int *id2, int *id3) {
  *id3 = src % dim3;
  src /= dim3;

  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
                                                        int dim2, int *id0,
                                                        int *id1, int *id2) {
  *id2 = src % dim2;
  src /= dim2;

  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
                                                        int *id0, int *id1) {
  *id1 = src % dim1;
  *id0 = src / dim1;
}
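The flat_*dim/decompose_*dim helpers above are ordinary row-major index arithmetic. A quick NumPy check of flat_4dim (illustration only, arbitrary shape):

import numpy as np

def flat_4dim(id1, id2, id3, id4, dim2, dim3, dim4):
    # same nesting as the CUDA helper: innermost index varies fastest
    return ((id1 * dim2 + id2) * dim3 + id3) * dim4 + id4

a = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
assert a[1, 2, 3, 4] == a.ravel()[flat_4dim(1, 2, 3, 4, 3, 4, 5)]  # offset 119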
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh (new file, mode 100644)

// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_

#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX

#endif
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h (new file, mode 100644)

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

#include <fstream>

#include "kernels.h"

using namespace std;

template <typename T>
class Normalize_Layer {
 public:
  struct Config {
    uint32_t hidden_dim;
    bool use_mean;
    Config(uint32_t hidden_dim, bool use_mean = false)
        : hidden_dim(hidden_dim), use_mean(use_mean) {}
  };

  Normalize_Layer(Config config, size_t max_rows)
      : config_(config), vars_(nullptr), means_(nullptr) {
    vars_ = cuda_malloc<T>(max_rows);
    if (config_.use_mean) {
      means_ = cuda_malloc<T>(max_rows);
    }
  }

  ~Normalize_Layer() {
    cuda_free(vars_);
    cuda_free(means_);
  }

  void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
               int batch_size, cudaStream_t stream) {
    launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
                      config_.hidden_dim, stream);
  }

  /*
  residual_grad, inp_or_out, betta should be treated carefully.
  inp_or_out = input if use_mean else output
  residual_grad, betta can be nullptr.
  residual_grad will be added to dinp if it is not nullptr
    which is useful in transformer layer when pre-ln
  betta are only used to compute xhat,
    (use_mean == false) ^ (betta == nullptr) should be true
  */
  void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                const T *residual_grad, const T *inp_or_out, const T *gamma,
                const T *betta, int batch_size, cudaStream_t stream[2]) {
    launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
                 inp_or_out, gamma, betta, vars_, means_, batch_size,
                 config_.hidden_dim, stream);
  }

  inline bool use_mean() const { return config_.use_mean; }

 private:
  Config config_;
  T *vars_;
  T *means_;
};
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h (new file, mode 100644)

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

#include <fstream>

#include "kernels.h"

using namespace std;

template <typename T>
class Softmax {
 public:
  struct Config {
    size_t nhead;
    Config(size_t nhead) : nhead(nhead) {}
  };

  Softmax(Config config) : config_(config) {}

  ~Softmax() {}

  void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
               int to_len, cudaStream_t &stream, bool mask_future = true) {
    launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead,
                           from_len, to_len, mask_future, stream);
  }

  void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
                int to_len, cudaStream_t stream) {
    launch_attn_softmax_bw<T>(out_grad, soft_out,
                              batch_size * config_.nhead * from_len, to_len,
                              stream);
  }

  void reset_size(size_t nhead) { config_.nhead = nhead; }

 private:
  Config config_;
};