OpenDAS / ColossalAI · Commits

Commit 8823cc48 (unverified)
Authored Jan 29, 2024 by Frank Lee; committed by GitHub, Jan 29, 2024

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu

Parents: bce9499e, 73f4dc57
Changes: 266
Showing 20 changed files with 0 additions and 2764 deletions (+0 −2764):
colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh                       +0 −43
colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cu                        +0 −225
colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cuh                       +0 −53
colossalai/kernel/cuda_native/csrc/gptq/tuning.h                            +0 −12
colossalai/kernel/cuda_native/csrc/gptq/util.cuh                            +0 −33
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu                 +0 −191
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu               +0 −88
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu                     +0 −169
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu               +0 −1002
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu               +0 −232
colossalai/kernel/cuda_native/csrc/kernels/include/context.h                +0 −36
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h    +0 −46
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h        +0 −41
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h              +0 −34
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h                +0 −96
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h           +0 −69
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h                +0 −275
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh               +0 −12
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h        +0 −65
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h                +0 −42

colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>
#include "q4_matrix.cuh"
#include "tuning.h"
// Workaround for hipify_python using rocblas instead of hipblas.
#if defined(USE_ROCM)
#include <hipblas/hipblas.h>
#define rocblas_handle hipblasHandle_t
#endif
void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero = false,
    cudaStream_t alt_stream = NULL
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    const cublasHandle_t handle,
    bool no_zero = false
);

#endif
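
The header exposes two entry points for the same product: q4_matmul_cuda runs a fused kernel directly on the packed 4-bit weights, while q4_matmul_recons_cuda first dequantizes the matrix and hands the fp16 GEMM to cuBLAS. In the upstream exllama code the choice is driven by ExLlamaTuning::matmul_recons_thd; the sketch below shows that dispatch under stated assumptions (the wrapper name and exact condition are illustrative, not part of this file):

void q4_matmul_dispatch(ExLlamaTuning* tuningParams, const half* x, int x_height,
                        Q4Matrix* w, half* out, cublasHandle_t handle)
{
    // Tall activations amortize the dequantize cost, so reconstruct + cuBLAS
    // wins there; short ones (e.g. single-token decode) use the fused 4-bit kernel.
    if (tuningParams->matmul_recons_thd && x_height >= tuningParams->matmul_recons_thd)
        q4_matmul_recons_cuda(tuningParams, x, x_height, w, out, handle);
    else
        q4_matmul_cuda(tuningParams, x, x_height, w, out);
}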

colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cu (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "q4_matrix.cuh"
#include <vector>
#include "util.cuh"
#include "matrix.cuh"
using namespace std;

const int UNSHUF_BLOCKSIZE_X = 64;

const int RECONS_THREADS_X = 64;  // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1;   // Block size and thread count along rows in x and out, each thread converts 8 rows

vector<Q4Matrix*> g_q4_matrices;

void g_q4_keep_matrix(Q4Matrix* m)
{
    g_q4_matrices.push_back(m);
}

void g_q4_free_matrices()
{
    for (const auto& m : g_q4_matrices) delete m;
    g_q4_matrices.clear();
}

Q4Matrix::Q4Matrix
(
    const int _height,
    const int _width,
    const int _groups,
    uint32_t* _qweight,
    uint32_t* _qzeros,
    half* _scales,
    uint32_t* _g_idx,
    const int _device
) :
    height(_height),
    width(_width),
    groups(_groups),
    device(_device)
{
    cudaSetDevice(device);

    cuda_qweight = _qweight;
    cuda_qzeros = _qzeros;
    cuda_scales = _scales;

    groupsize = height / groups;

    if (_g_idx) make_sequential(_g_idx);
}

Q4Matrix::~Q4Matrix()
{
}

// Make sequential

__global__ void make_sequential_kernel
(
    const uint32_t* __restrict__ w,
    uint32_t* __restrict__ w_new,
    const uint32_t* __restrict__ x_map,
    const int w_height,
    const int w_width
)
{
    const uint64_t* w2 = (uint64_t*) w;
    uint64_t* w_new2 = (uint64_t*) w_new;
    int w2_stride = w_width >> 1;

    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    if (w2_column >= w2_stride) return;

    int w_new2_row = blockIdx.y;

    int x_map_idx = w_new2_row << 3;

    uint64_t dst = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        int source_row = x_map[x_map_idx++];

        int w2_row = source_row >> 3;
        int w2_subrow = source_row & 0x07;
        int w2_row_shift = w2_subrow << 2;
        int wnew2_row_shift = i << 2;

        uint64_t src = w2[w2_row * w2_stride + w2_column];
        src >>= w2_row_shift;
        src &= 0x0000000f0000000f;
        src <<= wnew2_row_shift;
        dst |= src;
    }

    w_new2[w_new2_row * w2_stride + w2_column] = dst;
}

void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
    uint32_t* cuda_new_qweight = NULL;
    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch

    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));

    // Group histogram

    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;

    // Group map

    for (int i = 0, acc = 0; i < groups; i++)
    {
        short tmp = cpu_g_idx_map[i];
        cpu_g_idx_map[i] = acc;
        acc += tmp;
    }

    // X map (inverse)

    for (int row = 0; row < height; row++)
    {
        uint32_t target_group = cpu_g_idx[row];
        uint32_t target_row = cpu_g_idx_map[target_group];
        cpu_g_idx_map[target_group]++;
        cpu_x_map_inv[row] = target_row;
    }

    // X map

    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;

    // Move to CUDA

    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);

    // Rearrange rows in w

    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
    dim3 blocks
    (
        (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),
        height / 8,
        1
    );

    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);

    // Replace qweights

    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);

    // Cleanup

    cudaDeviceSynchronize();
    cudaFree(cuda_new_qweight);
    free(cpu_g_idx_map);
    free(cpu_x_map);
    free(cpu_x_map_inv);
}

__global__ void reconstruct_kernel
(
    const uint32_t* __restrict__ w,
    half* __restrict__ out,  // (y)
    const half* __restrict__ w_scales,
    const uint32_t* __restrict__ w_zeros,
    const int height,
    const int width,
    const int groupsize
)
{
    // Start of block

    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
    if (column >= width) return;

    // Views

    MatrixView_q4_column w_(w, height, width);
    MatrixView_half_rw out_(out, height, width);
    MatrixView_half w_scales_(w_scales, height / groupsize, width);
    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);

    // Groupsize version

    int group = row / groupsize;

    half w_scale = w_scales_.item(group, column);
    uint32_t w_zero = w_zeros_.item(group, column) + 1;

    uint32_t w_read = w_.item_uint32_t(row, column);
    half* out_ptr = out_.item_ptr(row, column);

    #pragma unroll
    for (int s = 0; s < 32; s += 4)
    {
        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
        *out_ptr = w_item; out_ptr += out_.width;
    }
}

void Q4Matrix::reconstruct(half* out)
{
    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);

    dim3 blocks
    (
        (width + threads.x - 1) / threads.x,
        (height / 8 + threads.y - 1) / threads.y,
        1
    );

    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}
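
make_sequential undoes GPTQ act-order: it builds x_map so that rows sharing a quantization group become contiguous, then repacks qweight accordingly. Each uint32 packs eight 4-bit rows, and the kernel works on uint64 word-pairs (two adjacent columns at once). A host-side transliteration of the per-word gather, for illustration only (hypothetical helper, not part of the deleted file):

#include <cstdint>

// Build one packed destination word-pair from 8 scattered source rows.
uint64_t gather_packed(const uint64_t* w2, const uint32_t* x_map,
                       int dst_row, int w2_stride, int w2_column)
{
    uint64_t dst = 0;
    for (int i = 0; i < 8; i++)
    {
        int source_row = x_map[dst_row * 8 + i];
        uint64_t src = w2[(source_row >> 3) * w2_stride + w2_column];
        src >>= (source_row & 0x07) << 2;   // bring the source nibble down to bit 0
        src &= 0x0000000f0000000fULL;       // keep one nibble from each 32-bit half
        dst |= src << (i << 2);             // deposit at destination slot i
    }
    return dst;
}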

colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,
        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,
        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);
};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif
\ No newline at end of file
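
For orientation, the buffers this class wraps follow the usual GPTQ packing; the shapes below are the upstream convention and should be read as an assumption, since the diff itself does not state them:

// Conventional GPTQ buffer shapes (assumption, per upstream exllama/GPTQ):
//   qweight : uint32 [height / 8][width]  — 8 4-bit weights per word, packed along rows
//   qzeros  : uint32 [groups][width / 8]  — 8 4-bit zero-points per word, packed along columns
//   scales  : half   [groups][width]
//   g_idx   : uint32 [height]             — per-row group index (act-order), consumed by make_sequential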

colossalai/kernel/cuda_native/csrc/gptq/tuning.h (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _tuning_h
#define _tuning_h
struct ExLlamaTuning
{
    int matmul_recons_thd;
    bool matmul_fused_remap;
    bool matmul_no_half2;
};

#endif

colossalai/kernel/cuda_native/csrc/gptq/util.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _util_cuh
#define _util_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase
#endif
// React to failure on return code != cudaSuccess
#define _cuda_check(fn) \
do { \
{_cuda_err = fn;} \
if (_cuda_err != cudaSuccess) goto _cuda_fail; \
} while(false)
// React to failure on return code == 0
#define _alloc_check(fn) \
do { \
if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
else _cuda_err = cudaSuccess; \
} while(false)
#endif
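
Both macros jump to a _cuda_fail label and expect a _cuda_err variable in the enclosing scope. A minimal, hypothetical caller, purely for illustration (every name beyond _cuda_err/_cuda_fail is invented here, not taken from the file):

cudaError_t example_alloc(half** out_buf, size_t n)
{
    cudaError_t _cuda_err;                               // name required by _cuda_check
    _cuda_check(cudaMalloc(out_buf, n * sizeof(half)));  // jumps to _cuda_fail on error
    return cudaSuccess;

_cuda_fail:
    return _cuda_err;
}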

colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu (deleted, 100644 → 0)

#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"
ls::cub::CachingDeviceAllocator g_allocator(true);

template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
    const T *__restrict__ inputs, const int *__restrict__ targets,
    float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[2] = {0.f, 0.f};  // logit and logit exp
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    if (threadIdx.x == 0) {
      nll_loss_outputs[blockIdx.x] = 0.f;
      outputs[blockIdx.x] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += logit;
    sum_logits[1] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 2>(sum_logits);
  __shared__ float s_sum_logit;
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_logit = sum_logits[0];
    s_sum_exp = sum_logits[1];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  if (threadIdx.x == 0) {
    // neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
    float nll_loss = logf(s_sum_exp) -
                     static_cast<float>(inputs[block_start + target_tid]) +
                     s_max_input;
    nll_loss_outputs[blockIdx.x] = nll_loss;
    float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
    outputs[blockIdx.x] =
        (1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
  }
}

template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
    const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
    const int *__restrict__ targets, T *__restrict__ grad_inputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[1] = {0.f};
  const float grad_out = static_cast<float>(grad_outputs[0]);
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    for (int i = left_idx; i < right_idx; i += blockDim.x) {
      grad_inputs[i] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 1>(sum_logits);
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_exp = sum_logits[0];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  float nll_weight = 1.0 - epsilon - eps_i;

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
    float grad = 0;
    grad += (vocab_size * prob - 1) * eps_i;
    grad += prob * nll_weight;
    if ((i - block_start) == target_tid) {
      grad -= nll_weight;
    }
    grad_inputs[i] = grad_out * grad;
  }
}

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  float *nll_loss_buffer = loss_buffer + grid_dim;
  ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
      epsilon, vocab_size);

  int num_items = grid_dim;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(
      g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, nll_loss_buffer, nll_loss_ptr,
      num_items, stream));
  CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}

template void launch_cross_entropy_fw<float>(
    const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_fw<__half>(
    const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
      epsilon, vocab_size);
}

template void launch_cross_entropy_bw<float>(
    const float *grad_outputs_ptr, const float *inputs_ptr,
    const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_bw<__half>(
    const float *grad_outputs_ptr, const __half *inputs_ptr,
    const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);
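
For reference, the forward kernel computes label-smoothed cross entropy. Writing $V$ for vocab_size, $\epsilon_i = \epsilon/(V-1)$, $\mathrm{nll}_j = \log\sum_k e^{x_k - x_{\max}} - (x_j - x_{\max})$, and $p_j$ for the softmax probabilities, the per-token output reduces to (my derivation from the code above, not text from the diff):

$$\mathcal{L} \;=\; (1-\epsilon-\epsilon_i)\,\mathrm{nll}_t \;+\; \epsilon_i \sum_{j=1}^{V}\mathrm{nll}_j \;=\; (1-\epsilon)\bigl(-\log p_t\bigr) \;+\; \frac{\epsilon}{V-1}\sum_{j\neq t}\bigl(-\log p_j\bigr)$$

where the kernel's sum_nll_loss is exactly $\sum_j \mathrm{nll}_j = V\log\sum_k e^{x_k - x_{\max}} - \sum_j (x_j - x_{\max})$.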

colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu (deleted, 100644 → 0)

/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include "cublas_wrappers.h"
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_16F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, (void *)C, CUDA_R_16F, m, CUDA_R_32F,
                   algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
            "error: %d)\n",
            batch, m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const __half *A, const __half *B, __half *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}
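
The leading dimensions follow cuBLAS's column-major convention ((transa == CUBLAS_OP_N) ? m : k, and so on), and both element types accumulate in CUDA_R_32F even when operands are fp16. A minimal hypothetical call for a plain fp32 GEMM C = A × B, with A m×k, B k×n, C m×n in column-major order (the handle and device buffers are assumed to exist; names are illustrative):

float alpha = 1.0f, beta = 0.0f;
// d_A, d_B, d_C: device buffers; handle: a live cublasHandle_t
cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, &beta,
               d_A, d_B, d_C, CUBLAS_GEMM_DEFAULT);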

colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu (deleted, 100644 → 0)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include "cuda_util.h"
/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
  return cudaGetErrorString(error);
}

std::string _cudaGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }
  return "CUBLAS_UNKNOW";
}

template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line) {
  if (result) {
    throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" +
                             std::to_string(line) +
                             "): " + (_cudaGetErrorString(result)) + "\n");
  }
}

template void check_gpu_error<cudaError_t>(cudaError_t result,
                                           char const *const func,
                                           const char *const file,
                                           int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,
                                              char const *const func,
                                              const char *const file,
                                              int const line);

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<T> hout(num_output_ele, (T)0);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << hout[i] << ", ";
  }
  std::cout << std::endl;
}

template <>
void print_vec<__half>(const __half *outv, std::string outn,
                       int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<__half> hout(num_output_ele, (__half)0.f);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << __half2float(hout[i]) << ", ";
  }
  std::cout << std::endl;
}

template void print_vec<float>(const float *outv, std::string outn,
                               int num_output_ele);

template void print_vec<int>(const int *outv, std::string outn,
                             int num_output_ele);

template void print_vec<__half>(const __half *outv, std::string outn,
                                int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num) {
  size_t byte_size = ele_num * sizeof(T);
  T *pdata = nullptr;
  CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
  return pdata;
}

template float *cuda_malloc<float>(size_t ele_num);

template __half *cuda_malloc<__half>(size_t ele_num);

template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);

void cuda_free(void *pdata) {
  if (pdata != nullptr) {
    cudaFree(pdata);
  }
}

template <typename T>
struct _isnan {
  __device__ bool operator()(T a) const { return isnan(a); }
};

template <>
struct _isnan<__half> {
  __device__ bool operator()(const __half a) const { return __hisnan(a); }
};

template <typename T>
struct _isinf {
  __device__ bool operator()(T a) const { return isinf(a); }
};

template <>
struct _isinf<__half> {
  __device__ bool operator()(const __half a) const { return __hisinf(a); }
};

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream) {
  // check_nan_inf = 0 for checking nan
  // check_nan_inf = 1 for checking inf
  bool res = false;
  std::string msg = file + "(" + std::to_string(line) + "): ";
  if (check_nan_inf) {
    msg += "nan.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isnan<T>(), false,
                                   thrust::logical_or<bool>());
  } else {
    msg += "inf.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isinf<T>(), false,
                                   thrust::logical_or<bool>());
  }
  if (res) {
    throw std::runtime_error(msg);
  }
  std::cout << msg << " [check pass]." << std::endl;
}

template void check_nan_inf<float>(const float *data_ptr, int dsize,
                                   bool check_nan_inf, std::string file,
                                   int line, cudaStream_t stream);

template void check_nan_inf<__half>(const __half *data_ptr, int dsize,
                                    bool check_nan_inf, std::string file,
                                    int line, cudaStream_t stream);
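
The CHECK_GPU_ERROR macro used throughout these kernels comes from the companion header cuda_util.h, which this commit also deletes but whose body is not shown in this diff. A minimal sketch of the conventional definition it would need to match check_gpu_error's (result, func, file, line) signature — an assumption, not the verified header text:

// Hedged sketch; the actual macro lives in the (unshown) cuda_util.h.
#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)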

colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu (deleted, 100644 → 0)

#include <chrono>
#include <ctime>
#include "kernels.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

curandStatePhilox4_32_10_t *curandstate;

/**
 * @brief element-wise activation function on device, like Relu, Gelu
 *
 * @tparam enum class ActivationType, kRelu, kGelu
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape and type with input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_kernel(T x);

template <>
__device__ float activation_kernel<ActivationType::kGelu, float>(float x) {
  float cdf =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x))));
  return x * cdf;
}

template <>
__device__ __half2
activation_kernel<ActivationType::kGelu, __half2>(__half2 val) {
  __half2 val_pow3 = __hmul2(val, __hmul2(val, val));
  float2 tmp_pow = __half22float2(val_pow3);
  float2 tmp = __half22float2(val);

  tmp.x =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
  tmp.y =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
  return __hmul2(val, __float22half2_rn(tmp));
}

template <>
__device__ float activation_kernel<ActivationType::kRelu, float>(float x) {
  return fmaxf(x, 0);
}

template <>
__device__ __half2
activation_kernel<ActivationType::kRelu, __half2>(__half2 x) {
  return __floats2half2_rn(fmaxf(0.f, __half2float(x.x)),
                           fmaxf(0.f, __half2float(x.y)));
}
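
The constants above come from the tanh approximation of GELU; 0.7978845608028654 is sqrt(2/pi). For reference (my note, not text from the diff):

$$\mathrm{gelu}(x) \;\approx\; 0.5\,x\left(1 + \tanh\!\left(\sqrt{2/\pi}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)$$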
/**
 * @brief element-wise activation backward function on device
 *
 * @tparam enum class ActivationType
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape of input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_bwd_kernel(T grad, T x);

template <>
__device__ float activation_bwd_kernel<ActivationType::kGelu, float>(
    float grad, float x) {
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * (dg1 + dg2 + dg3);
}

template <>
__device__ __half activation_bwd_kernel<ActivationType::kGelu, __half>(
    __half grad, __half x_half) {
  float x = __half2float(x_half);
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * __float2half(dg1 + dg2 + dg3);
}

template <>
__device__ float activation_bwd_kernel<ActivationType::kRelu, float>(
    float grad, float x) {
  return x > 0.f ? grad : 0.f;
}

template <>
__device__ __half
activation_bwd_kernel<ActivationType::kRelu, __half>(__half grad, __half x) {
  const __half half_zero = __float2half(0.f);
  return x > half_zero ? grad : half_zero;
}

template <>
__device__ __half2 activation_bwd_kernel<ActivationType::kRelu, __half2>(
    __half2 grad2, __half2 x_half2) {
  const __half half_zero = __float2half(0.f);
  return __floats2half2_rn(x_half2.x > half_zero ? grad2.x : half_zero,
                           x_half2.y > half_zero ? grad2.y : half_zero);
}

/**
 * @brief init curand states in global memory
 *
 * @thread grid_dim * block_dim to support any size of states
 * @param state persistent curand states
 * @param seed seed to init states
 * @return void
 */
__global__ void curand_init_kernel(curandStatePhilox4_32_10_t *state,
                                   int seed) {
  /* Each thread gets same seed, a different sequence
  number, no offset */
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  curand_init(seed, id, 0, &state[id]);
}

void launch_curand_init(int total_count, int dim, cudaStream_t stream) {
  cudaMalloc(&curandstate, total_count * sizeof(curandStatePhilox4_32_10_t));
  int grid_dim = total_count >> 9;
  curand_init_kernel<<<grid_dim, 512, 0, stream>>>(
      curandstate, std::chrono::duration_cast<std::chrono::microseconds>(
                       std::chrono::system_clock::now().time_since_epoch())
                       .count());
}
/**
 * @brief element-wise dropout, store dropped position in mask, it's not
 * in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out any size of float and __half
 * @param in same with out
 * @param mask uint8 type, same size with out
 * @param seed seed to curand
 * @return void
 */
__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  float *__restrict__ out,
                                  const float *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];

  float4 input4 = data4[i];
  float4 res4;
  res4.x = input4.x * scale * m[0];
  res4.y = input4.y * scale * m[1];
  res4.z = input4.z * scale * m[2];
  res4.w = input4.w * scale * m[3];
  out4[i] = res4;
}

__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  __half *__restrict__ out,
                                  const __half *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = (uint8_t)(rand.x > ratio);
  m[5] = (uint8_t)(rand.y > ratio);
  m[6] = (uint8_t)(rand.z > ratio);
  m[7] = (uint8_t)(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = *m8;

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
  __half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
  __half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
  __half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  outs_float4[i] = out_float4;
}

/**
 * @brief element-wise dropout backward with dropout mask, it's
 * not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param in any size of float and __half
 * @param mask uint8 type, same size with in
 * @return void
 */
__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      float *out, const float *in,
                                      const uint8_t *__restrict__ mask) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *in4 = reinterpret_cast<const float4 *>(in);
  const uint32_t *mask4 = reinterpret_cast<const uint32_t *>(mask);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  m4[0] = mask4[i];

  float4 input4 = in4[i];
  float4 res4;
  res4.x = input4.x * scale * static_cast<float>(m[0]);
  res4.y = input4.y * scale * static_cast<float>(m[1]);
  res4.z = input4.z * scale * static_cast<float>(m[2]);
  res4.w = input4.w * scale * static_cast<float>(m[3]);
  out4[i] = res4;
}

__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      __half *out, const __half *in,
                                      const uint8_t *__restrict__ mask) {
  const __half scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  const uint64_t *mask8 = reinterpret_cast<const uint64_t *>(mask);

  uint8_t m[8];
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  m8[0] = mask8[i];

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 =
      __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 =
      __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 =
      __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 =
      __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  out4[i] = out_float4;
}

template <>
void launch_ls_dropout<float>(float *out, const float *vals, uint8_t *mask,
                              int total_count, float ratio,
                              cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 12;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count,
                                                             ratio, out, vals,
                                                             mask);
  }
}

template <>
void launch_ls_dropout<__half>(__half *out, const __half *vals, uint8_t *mask,
                               int total_count, float ratio,
                               cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 13;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count,
                                                             ratio, out, vals,
                                                             mask);
  }
}
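
For reference, the forward and backward kernels above implement standard inverted dropout; the formula below summarizes the scaling they apply (my summary, not text from the diff):

$$y_i = \frac{m_i}{1-p}\,x_i,\qquad m_i \sim \mathrm{Bernoulli}(1-p),\qquad \mathbb{E}[y_i] = x_i,\qquad \frac{\partial L}{\partial x_i} = \frac{m_i}{1-p}\,\frac{\partial L}{\partial y_i}$$

The float path vectorizes over float4 (4 elements per thread), the __half path over float4 reinterpreted as four __half2 values (8 elements per thread), which is why the grid shifts differ (>> 12 vs >> 13).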
/**
 * @brief fused bias, dropout, and residual at the end of Attention and FFN,
 * store dropped position in mask, it's not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out [batch_size, seq_len, hidden_size], float and __half
 * @param in [batch_size, seq_len, hidden_size], float and __half
 * @param mask [batch_size, seq_len, hidden_size], uint8 type
 * @param bias [hidden_size], ffn bias
 * @param residual [batch_size, seq_len, hidden_size], float and __half
 * @param seed seed to curand
 * @param hidden_size hidden size
 * @return void
 */
__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, float *__restrict__ out,
    const float *__restrict__ in, uint8_t *__restrict__ mask,
    const float *__restrict__ bias, const float *__restrict__ residual,
    const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);

  int bias_i = i % (hidden_size >> 2);
  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];
  const float4 input4 = data4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 output4;

  output4.x = (input4.x + b4.x) * scale * m[0] + res4.x;
  output4.y = (input4.y + b4.y) * scale * m[1] + res4.y;
  output4.z = (input4.z + b4.z) * scale * m[2] + res4.z;
  output4.w = (input4.w + b4.w) * scale * m[3] + res4.w;

  out4[i] = output4;
}

__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, __half *__restrict__ out,
    const __half *__restrict__ in, uint8_t *__restrict__ mask,
    const __half *__restrict__ bias, const __half *__restrict__ residual,
    const int seed, const int hidden_size) {
  const __half scale = 1. / (1. - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = static_cast<uint8_t>(rand.x > ratio);
  m[5] = static_cast<uint8_t>(rand.y > ratio);
  m[6] = static_cast<uint8_t>(rand.z > ratio);
  m[7] = static_cast<uint8_t>(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = m8[0];

  int bias_i = i % (hidden_size >> 3);
  float4 val_float4 = vals_float4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 out_float4;

  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
  const __half2 *res_half2 = reinterpret_cast<const __half2 *>(&res4);
  __half2 scale_mask_1 =
      __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 =
      __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 =
      __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 =
      __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] =
      __hfma2(__hadd2(val_half2[0], b_half2[0]), scale_mask_1, res_half2[0]);
  out_half2[1] =
      __hfma2(__hadd2(val_half2[1], b_half2[1]), scale_mask_2, res_half2[1]);
  out_half2[2] =
      __hfma2(__hadd2(val_half2[2], b_half2[2]), scale_mask_3, res_half2[2]);
  out_half2[3] =
      __hfma2(__hadd2(val_half2[3], b_half2[3]), scale_mask_4, res_half2[3]);
  outs_float4[i] = out_float4;
}

template <>
void launch_ls_dropout_res_bias<float>(float *out, const float *vals,
                                       uint8_t *mask, const float *bias,
                                       const float *residual, int total_count,
                                       int dim, float ratio,
                                       cudaStream_t stream) {
  int grid_dim = total_count >> 12;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}

template <>
void launch_ls_dropout_res_bias<__half>(__half *out, const __half *vals,
                                        uint8_t *mask, const __half *bias,
                                        const __half *residual,
                                        int total_count, int dim, float ratio,
                                        cudaStream_t stream) {
  int grid_dim = total_count >> 13;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}
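
The fusion computes the whole residual branch in one pass over memory; per element (my summary of the code above):

$$y = \frac{m}{1-p}\,(x + b) + r$$

where b is broadcast per hidden unit (hence bias_i = i % (hidden_size >> 2 or 3)) and the __half path evaluates the multiply-add with a single __hfma2 per __half2 pair.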
/**
 * @brief fused bias and dropout backward at the end of Attention and FFN
 *
 * @thread
 * gridDim.x = hidden_size / 8
 * blockDim.x = 8
 * blockDim.y = 1024 / 8 = 128
 *
 * @param row_size batch_size * seq_len
 * @param ratio dropout ratio
 * @param in_grad [batch_size, seq_len, hidden_size], input grad
 * @param bias_grad [hidden_size], bias grad
 * @param out_grad [batch_size, seq_len, hidden_size], output grad
 * @param mask [batch_size, seq_len, hidden_size], dropout mask
 * @param hidden_size
 * @return void
 */
__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, float *__restrict__ in_grad,
    float *__restrict__ bias_grad, const float *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  // every block generate 8 bias result
  __shared__ float tile[8][129];
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  float local_sum = 0;

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    float val = out_grad[idx];
    val *= scale * static_cast<float>(mask[idx]);
    local_sum += val;
    in_grad[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  float sum = 0;
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad[pos] = tile[0][threadIdx.x];
  }
}

__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, __half *__restrict__ in_grad,
    __half *__restrict__ bias_grad, const __half *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
  __shared__ __half2 tile[8][129];
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
  __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
  const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
  __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  __half2 local_sum = __float2half2_rn(0.f);

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    __half2 val = out_grad2[idx];
    __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
    val *= scale * m2;
    local_sum += val;
    in_grad2[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  __half2 sum = __float2half2_rn(0.f);
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad2[pos] = tile[0][threadIdx.x];
  }
}

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template <>
void launch_ls_dropout_bias_bwd(__half *in_grad, __half *bias_grad,
                                const __half *out_grad, const uint8_t *mask,
                                int row_size, int dim, float ratio,
                                cudaStream_t stream) {
  dim >>= 1;
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template void launch_ls_dropout_bias_bwd(float *in_grad, float *bias_grad,
                                         const float *out_grad,
                                         const uint8_t *mask, int row_size,
                                         int dim, float ratio,
                                         cudaStream_t stream);
/**
 * @brief fused bias, activation, and dropout at the end of first ffn
 *
 * @thread
 * gridDim.x = hidden_size / 8
 * blockDim.x = 8
 * blockDim.y = 1024 / 8 = 128
 *
 * @tparam act_type activation function, like kRelu, kGelu
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out [batch_size, seq_len, hidden_size], float and __half
 * @param in [batch_size, seq_len, hidden_size], float and __half
 * @param mask [batch_size, seq_len, hidden_size], uint8 type
 * @param bias [hidden_size], ffn bias
 * @param seed seed to curand
 * @param hidden_size
 * @return void
 */
template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
    const int total_count, const float ratio, float *__restrict__ out,
    const float *__restrict__ in, uint8_t *__restrict__ mask,
    const float *__restrict__ bias, const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);

  int bias_i = i % (hidden_size >> 2);
  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];
  const float4 input4 = data4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  float4 output4;

  output4.x =
      activation_kernel<act_type, float>(input4.x + b4.x) * scale * m[0];
  output4.y =
      activation_kernel<act_type, float>(input4.y + b4.y) * scale * m[1];
  output4.z =
      activation_kernel<act_type, float>(input4.z + b4.z) * scale * m[2];
  output4.w =
      activation_kernel<act_type, float>(input4.w + b4.w) * scale * m[3];

  out4[i] = output4;
}

template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
    const int total_count, const float ratio, __half *__restrict__ out,
    const __half *__restrict__ in, uint8_t *__restrict__ mask,
    const __half *__restrict__ bias, const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = (uint8_t)(rand.x > ratio);
  m[5] = (uint8_t)(rand.y > ratio);
  m[6] = (uint8_t)(rand.z > ratio);
  m[7] = (uint8_t)(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = *m8;

  int bias_i = i % (hidden_size >> 3);
  float4 val_float4 = vals_float4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
  __half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
  __half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
  __half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
  __half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
  out_half2[0] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[0], b_half2[0])),
                         scale_mask_1);
  out_half2[1] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[1], b_half2[1])),
                         scale_mask_2);
  out_half2[2] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[2], b_half2[2])),
                         scale_mask_3);
  out_half2[3] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[3], b_half2[3])),
                         scale_mask_4);
  outs_float4[i] = out_float4;
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, float>(
    float *out, const float *vals, uint8_t *mask, const float *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 10;
  ls_dropout_act_bias_kernel<ActivationType::kGelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, __half>(
    __half *out, const __half *vals, uint8_t *mask, const __half *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 11;
  ls_dropout_act_bias_kernel<ActivationType::kGelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, float>(
    float *out, const float *vals, uint8_t *mask, const float *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 10;
  ls_dropout_act_bias_kernel<ActivationType::kRelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, __half>(
    __half *out, const __half *vals, uint8_t *mask, const __half *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 11;
  ls_dropout_act_bias_kernel<ActivationType::kRelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}
/**
 * @brief fused bias, activation, and dropout backward
 *
 * @thread
 * gridDim.x = ceil(hidden_size / WARP_SIZE)
 * blockDim.x = WARP_SIZE
 * blockDim.y = WARP_SIZE
 *
 * @tparam act_type kRelu or kGelu
 * @param row_size batch_size * seq_len
 * @param ratio dropout ratio
 * @param in_grad [batch_size, seq_len, hidden_size], input grad
 * @param bias_grad [hidden_size], bias grad
 * @param input [batch_size, seq_len, hidden_size], forward input
 * @param bias [hidden_size], forward bias
 * @param out_grad [batch_size, seq_len, hidden_size], output grad
 * @param mask [batch_size, seq_len, hidden_size], dropout mask
 * @param hidden_size
 * @return void
 */
template <ActivationType act_type, typename T>
__global__ void ls_dropout_act_bias_bwd_kernel(
    const int row_size, const float ratio, T *in_grad,
    T *__restrict__ bias_grad, const T *__restrict__ input,
    const T *__restrict__ bias, const T *out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  __shared__ float tile[WARP_SIZE][WARP_SIZE + 1];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int stride = hidden_size * WARP_SIZE;
  float local_sum = 0;

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  if (col_idx < hidden_size) {
    for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
      float val = out_grad[idx];
      float in = input[idx];
      float b = bias[idx % hidden_size];
      val = activation_bwd_kernel<act_type, float>(
          val * scale * static_cast<float>(mask[idx]), in + b);
      local_sum += val;
      in_grad[idx] = val;
      idx += stride;
    }
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();
  float sum = tile[threadIdx.y][threadIdx.x];
  __syncthreads();

  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
  __syncthreads();

  if (threadIdx.y == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
    bias_grad[pos] = tile[0][threadIdx.x];
  }
}
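
In the kernel just above, partial sums land in a shared tile padded to WARP_SIZE + 1 columns to dodge shared-memory bank conflicts on the transposed read, and a shfl_down ladder with doubling offsets folds each warp into lane 0. Below is a self-contained demo of just the shuffle ladder; the kernel name and setup are illustrative, not part of this file.

#include <cooperative_groups.h>
#include <cstdio>
namespace cg = cooperative_groups;

// Reduce one warp's values into lane 0 with doubling shfl_down offsets,
// the same pattern used for the bias_grad reduction above.
__global__ void warp_sum_demo(float *out) {
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
  float sum = threadIdx.x;  // lane i contributes i; total = 0+1+...+31 = 496
  for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);
  if (threadIdx.x == 0) *out = sum;
}

int main() {
  float *d_out, h_out;
  cudaMalloc(&d_out, sizeof(float));
  warp_sum_demo<<<1, 32>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %.0f (expect 496)\n", h_out);
  cudaFree(d_out);
  return 0;
}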
// @brief fused bias, activation, and dropout backward
// It is deprecated for precision reasons. Kept here for future optimization.
//
// template <ActivationType act_type>
// __global__ void ls_dropout_act_bias_bwd_kernel(
// const int row_size, const float ratio, __half * in_grad,
// __half *__restrict__ bias_grad, const __half *__restrict__ input, const
// __half *__restrict__ bias, const __half * out_grad, const uint8_t
// *__restrict__ mask, const int hidden_size) {
// const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
// __shared__ __half2 tile[WARP_SIZE][WARP_SIZE + 1];
// cg::thread_block b = cg::this_thread_block();
// cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
// __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
// __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
// const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
// const __half2 *input2 = reinterpret_cast<const __half2 *>(input);
// const __half2 *bias2 = reinterpret_cast<const __half2 *>(bias);
// int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// int stride = hidden_size * WARP_SIZE;
// __half2 local_sum = __float2half2_rn(0.f);
// int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
// if (col_idx < hidden_size) {
// for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
// __half2 val = out_grad2[idx];
// __half2 in2 = input2[idx];
// __half2 b2 = bias2[idx % hidden_size ];
// __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
// val = activation_bwd_kernel<ActivationType::kRelu, __half2>(val * scale
// *
// m2,
// in2+b2);
// local_sum += val;
// in_grad2[idx] = val;
// idx += stride;
// }
// }
// tile[threadIdx.x][threadIdx.y] = local_sum;
// __syncthreads();
// __half2 sum = tile[threadIdx.y][threadIdx.x];
// __syncthreads();
// for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
// if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
// __syncthreads();
// if (threadIdx.y == 0) {
// int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// bias_grad2[pos] = tile[0][threadIdx.x];
// }
// }
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  ls_dropout_act_bias_bwd_kernel<act_type><<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, input, bias, out_grad, mask, dim);
}
// template <>
// void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
// __half *in_grad, __half *bias_grad,const __half *input, const __half
// *bias, const __half *out_grad, const uint8_t *mask, int row_size, int
// dim, float ratio, cudaStream_t stream) {
// dim >>= 1;
// dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
// dim3 block_dim(WARP_SIZE, WARP_SIZE);
// ls_dropout_act_bias_bwd_kernel<ActivationType::kRelu>
// <<<grid_dim, block_dim, 0, stream>>>(row_size, ratio, in_grad,
// bias_grad,
// input, bias,out_grad, mask, dim);
// }
template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu
deleted
100644 → 0
View file @
bce9499e
#include <cooperative_groups.h>
#include "kernels.h"
namespace cg = cooperative_groups;
/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.
@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE
@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
                                  T *__restrict__ out, int rows, int cols) {
  __shared__ float tile[WARP_SIZE][WARP_SIZE];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int y_stride = cols * WARP_SIZE;
  float localSum = 0;

  // Loop across matrix row
  // TODO: optimize to log complexity
  if (idx < cols) {
    int offset = flat_2dim(threadIdx.y, idx, cols);
    for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
      localSum += (float)inp[offset];
      offset += y_stride;
    }
  }

  // The sum of a row in tile is equal to the sum of a col in original matrix
  tile[threadIdx.x][threadIdx.y] = localSum;
  __syncthreads();

  // Sum the shared buffer.
  // The change of threadIdx.x is continuous
  float sum = tile[threadIdx.y][threadIdx.x];
  __syncthreads();

  // Calculate the sum of a row in tile
  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
    if (pos < cols) out[pos] = sum;
  }
}
// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
                                              int rows, int cols,
                                              cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  column_sum_reduce<float>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}

template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
                                               int rows, int cols,
                                               cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  column_sum_reduce<__half>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
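
Each 32x32 block above owns a WARP_SIZE-wide slice of columns: loading rows in strides of WARP_SIZE and transposing through the shared tile lets one warp-shuffle pass finish every column. Below is a hedged host-side sketch of the typical call, computing a bias gradient; it assumes this file and its headers are compiled into the target, and the buffer names are illustrative.

#include <cuda_runtime.h>
#include "cuda_util.h"  // cuda_malloc / cuda_free
#include "kernels.h"    // launch_fuse_transpose_bias_kernel

// Reduce a [rows, cols] activation-gradient matrix over rows into a [cols]
// bias gradient, the typical use of this launcher.
void bias_grad_example(cudaStream_t stream) {
  const int rows = 4096, cols = 1024;  // rows = batch_size * seq_len
  float *d_inp = cuda_malloc<float>((size_t)rows * cols);
  float *d_out = cuda_malloc<float>(cols);
  launch_fuse_transpose_bias_kernel<float>(d_inp, d_out, rows, cols, stream);
  cuda_free(d_inp);
  cuda_free(d_out);
}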
/**
@brief: fused_add2
Add two matrices, inp1 and inp2, writing the result to out.
@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)
@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
                                  int hidden_dim);

template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
                                         const float *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    val.x = vinp1.x + vinp2.x;
    val.y = vinp1.y + vinp2.y;
    val.z = vinp1.z + vinp2.z;
    val.w = vinp1.w + vinp2.w;
    out_4[offset + i] = val;
  }
}

template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
                                          const __half *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;
  __half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
  __half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
  __half2 *h2_val = reinterpret_cast<__half2 *>(&val);

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
    h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
    h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
    h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
    out_4[offset + i] = val;
  }
}
//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1,
                              const float *inp2, int batch_size, int seq_len,
                              int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 2;
  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));
  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}

template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
                               const __half *inp2, int batch_size, int seq_len,
                               int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 3;
  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));
  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}
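
Both launchers shift hidden_dim before the launch because the kernels move float4 vectors, so hidden_dim must be divisible by 4 for float and by 8 for __half, and the tensors must be 16-byte aligned (which cudaMalloc guarantees). A hedged usage sketch under those assumptions; the names are illustrative.

#include <cuda_runtime.h>
#include "cuda_util.h"  // cuda_malloc
#include "kernels.h"    // launch_fused_add2

// out = inp1 + inp2 over a [batch, seq, hidden] tensor.
void fused_add2_example(cudaStream_t &stream) {
  int batch_size = 8, seq_len = 512, hidden_dim = 1024;  // 1024 % 4 == 0
  size_t n = (size_t)batch_size * seq_len * hidden_dim;
  float *d_a = cuda_malloc<float>(n);
  float *d_b = cuda_malloc<float>(n);
  float *d_out = cuda_malloc<float>(n);
  launch_fused_add2<float>(d_out, d_a, d_b, batch_size, seq_len, hidden_dim,
                           stream);
}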
template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
                                    int sz0, int sz2, int sz1_1, int sz1_2) {
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
  if (idx >= nele) {
    return;
  }
  float4 *dst_ptr = (float4 *)output + idx;
  int idx2 = idx % sz2;
  idx = idx / sz2;
  int idx1 = idx % (sz1_1 + sz1_2);
  int idx0 = idx / (sz1_1 + sz1_2);
  float4 *src_ptr = nullptr;
  int sz1 = 0;
  if (idx1 < sz1_1) {
    sz1 = sz1_1;
    src_ptr = (float4 *)inp1;
  } else {
    idx1 -= sz1_1;
    sz1 = sz1_2;
    src_ptr = (float4 *)inp2;
  }
  src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
  dst_ptr[0] = src_ptr[0];
}
template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
                                float *output, int sz0, int sz2, int sz1_1,
                                int sz1_2, cudaStream_t stream) {
  sz2 >>= 2;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}

template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
                                 __half *output, int sz0, int sz2, int sz1_1,
                                 int sz1_2, cudaStream_t stream) {
  sz2 >>= 3;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
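
kernel_concat3_dim1 joins [sz0, sz1_1, sz2] and [sz0, sz1_2, sz2] along dim 1, copying one float4 per thread; the launchers pre-shift sz2, so in elements it must be divisible by 4 (float) or 8 (__half). Below is a scalar host-side reference of the same index mapping, useful for testing; it is illustrative, not part of the file.

#include <vector>

// Scalar reference for concat along dim1: out[i0][i1][i2] comes from inp1
// when i1 < sz1_1, else from inp2 at row i1 - sz1_1. Here sz2 counts
// elements, not float4 vectors.
std::vector<float> concat3_dim1_ref(const std::vector<float> &inp1,
                                    const std::vector<float> &inp2, int sz0,
                                    int sz2, int sz1_1, int sz1_2) {
  std::vector<float> out((size_t)sz0 * (sz1_1 + sz1_2) * sz2);
  for (int i0 = 0; i0 < sz0; ++i0)
    for (int i1 = 0; i1 < sz1_1 + sz1_2; ++i1)
      for (int i2 = 0; i2 < sz2; ++i2) {
        float v = (i1 < sz1_1)
                      ? inp1[((size_t)i0 * sz1_1 + i1) * sz2 + i2]
                      : inp2[((size_t)i0 * sz1_2 + (i1 - sz1_1)) * sz2 + i2];
        out[((size_t)i0 * (sz1_1 + sz1_2) + i1) * sz2 + i2] = v;
      }
  return out;
}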
colossalai/kernel/cuda_native/csrc/kernels/include/context.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <iostream>
#include <string>
#include "cuda_util.h"
class Context {
 public:
  Context() : _stream(nullptr) {
    CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
  }

  virtual ~Context() {}

  static Context &Instance() {
    static Context _ctx;
    return _ctx;
  }

  void set_stream(cudaStream_t stream) {
    _stream = stream;
    CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
  }

  cudaStream_t get_stream() { return _stream; }

  cublasHandle_t get_cublashandle() { return _cublasHandle; }

 private:
  cudaStream_t _stream;
  cublasHandle_t _cublasHandle;
};
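
Context is a lazily constructed singleton that owns one cuBLAS handle and pins it to whatever stream the framework installs, so GEMMs and custom kernels serialize on the same queue. A minimal usage sketch; the stream name is illustrative.

#include <cuda_runtime.h>
#include "context.h"

// Route subsequent cuBLAS calls and kernel launches through one stream.
void bind_stream_example() {
  cudaStream_t stream;
  CHECK_GPU_ERROR(cudaStreamCreate(&stream));
  Context::Instance().set_stream(stream);
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  (void)handle;  // pass to GEMM wrappers alongside the stream-bound kernels
}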
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <type_traits>
#include "cuda_util.h"
template <typename T>
class CrossEntropyLayer {
 public:
  CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);

  virtual ~CrossEntropyLayer();

  void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
               float *nll_loss_ptr);

  void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
                const int *targets_ptr, T *grad_inputs_ptr);

  void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);

 private:
  void allocate_mem_buffer() {
    // allocate local gpu memory
    _loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
  }

  void free_mem_buffer() {
    // free local gpu memory
    cuda_free(_loss_buffer);
  }

  const int _padding_idx;
  const float _epsilon;
  const int _max_batch_tokens;

  size_t _batch_size;
  size_t _seq_len;
  size_t _vocab_size;

  float *_loss_buffer;
};
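
The layer is sized once by max_batch_tokens and re-shaped per batch via set_cur_batch_shape before Forward/Backward. Below is a hedged sketch of the calling sequence; the buffer shapes are assumptions read off the signatures, not documented contracts.

#include "cross_entropy_layer.h"

// Hypothetical buffers: d_logits [b, s, V], d_targets [b, s], the loss
// outputs and d_grad_loss are assumed to be small device scalars/buffers.
void criterion_example(const float *d_logits, const int *d_targets,
                       float *d_loss, float *d_nll_loss,
                       const float *d_grad_loss, float *d_grad_logits) {
  CrossEntropyLayer<float> criterion(/*epsilon=*/0.1f, /*padding_idx=*/0,
                                     /*max_batch_tokens=*/8192);
  criterion.set_cur_batch_shape(/*batch_size=*/8, /*seq_len=*/512,
                                /*vocab_size=*/32000);  // 8*512 <= 8192 tokens
  criterion.Forward(d_logits, d_targets, d_loss, d_nll_loss);
  criterion.Backward(d_grad_loss, d_logits, d_targets, d_grad_logits);
}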
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
deleted
100644 → 0
View file @
bce9499e
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(
    cublasHandle_t handle, int m, int n, int k, const float *alpha,
    const float *beta, const __half *A, const __half *B, __half *C,
    cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
    int stride_C, int batch,
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line);

#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num);

void cuda_free(void *pdata);

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream);

#define CHECK_NAN_INF(ptr, size, stream)                            \
  check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \
  check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
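
CHECK_GPU_ERROR stringifies the checked expression with #val and records __FILE__/__LINE__, so every CUDA runtime or cuBLAS call can be asserted in place; CHECK_NAN_INF runs the checker twice, once for NaN and once for Inf. A short sketch, assuming this translation unit links against the definitions:

#include <cuda_runtime.h>
#include "cuda_util.h"

void checked_alloc_example(cudaStream_t stream) {
  float *d_buf = nullptr;
  // On failure, the macro reports the stringified call plus file and line.
  CHECK_GPU_ERROR(cudaMalloc(&d_buf, 1024 * sizeof(float)));
  CHECK_GPU_ERROR(cudaMemsetAsync(d_buf, 0, 1024 * sizeof(float), stream));
  // While debugging, scan a tensor for NaN/Inf on the given stream:
  CHECK_NAN_INF(d_buf, 1024, stream);
  CHECK_GPU_ERROR(cudaFree(d_buf));
}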
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <string>
#include "kernels.h"
template <typename T>
class Dropout {
 public:
  struct Config {
    float ratio;
    bool training;

    Config(float r) : ratio(r), training(true) {}
    float RATIO() const { return training ? ratio : 0.0; }
  };

  Dropout(const Config &config, size_t max_ele_num)
      : _config(config), _mask(nullptr) {
    _mask = cuda_malloc<uint8_t>(max_ele_num);
  }

  virtual ~Dropout() { cuda_free(_mask); }

  // after attention softmax
  void dropout(T *output, const T *input, int count, cudaStream_t stream,
               bool bwd = false) {
    launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
                         bwd);
  }

  void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
    launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
                         stream, true);
  }

  // transformer layer's postprocessing dropout, after attn or ffn module,
  // before residual add.
  void bias_dropout_residual(T *output, const T *input, const T *residual,
                             const T *bias, int rows, int cols,
                             cudaStream_t stream) {
    launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
                                  rows * cols, cols, _config.RATIO(), stream);
  }

  void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
                               int rows, int cols, cudaStream_t stream) {
    launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
                                  _config.RATIO(), stream);
  }

  // dropout inside ffn.
  void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
                        int cols, std::string activation_fn,
                        cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
                          const T *bias, int rows, int cols,
                          std::string activation_fn, cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  bool HasDropout() const { return _config.RATIO() > 0.0; }

  void SetTrainingMode(bool training) { _config.training = training; }

 private:
  uint8_t *_mask;
  Config _config;
};
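
Dropout owns the uint8_t mask for its module, so the same instance must serve forward and backward, and setting training to false makes RATIO() return 0, turning the fused ops into plain bias (+ activation) passes. A hedged sketch of the FFN-internal path; the buffer names are illustrative.

#include "dropout.h"

// FFN-internal dropout: dropout(GeLU(input + bias)) over rows tokens of
// width cols.
void ffn_dropout_example(float *d_out, const float *d_in, const float *d_bias,
                         float *d_grad, float *d_bias_grad, int rows, int cols,
                         cudaStream_t stream) {
  Dropout<float> ffn_dropout(Dropout<float>::Config(0.1f),
                             (size_t)rows * cols);
  ffn_dropout.bias_act_dropout(d_out, d_in, d_bias, rows, cols, "gelu",
                               stream);
  // Backward reuses the mask recorded during the forward pass.
  ffn_dropout.d_bias_act_dropout(d_grad, d_bias_grad, d_in, d_bias, rows,
                                 cols, "gelu", stream);
  // For evaluation, RATIO() becomes 0 and the op degenerates to bias + act.
  ffn_dropout.SetTrainingMode(false);
}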
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
#include "kernels.h"
template <typename T>
class FeedForward {
 public:
  struct Config {
    int outputSize;
    int inputSize;
    std::array<int, 3> gemm_algos;
    Config(int outputs, int inputs)
        : outputSize(outputs),
          inputSize(inputs),
          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
  };

  FeedForward(Config config) : config_(config) {}

  ~FeedForward() {}

  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
               cublasHandle_t &_cublasHandle) {
    float alpha = T(1.);
    float beta = T(0.);

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
  }

  void Backward(int bsz, const T *out_grad, const T *input_ptr,
                const T *weights, T *weights_grad, T *bias_grad,
                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
                bool compute_bias = true) {
    float alpha = (T)1.0, beta = (T)0.0;
    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
    if (compute_bias) {
      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
                                           config_.outputSize, stream);
    }
  }

  void reset_size(int outputSize, int inputSize) {
    config_.outputSize = outputSize;
    config_.inputSize = inputSize;
  }

 private:
  Config config_;
};
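
Forward computes out = input * W^T for row-major input [bsz, inputSize] and weights [outputSize, inputSize]: since cuBLAS is column-major, a row-major matrix reads as its own transpose, so (CUBLAS_OP_T, CUBLAS_OP_N) with m = outputSize, n = bsz, k = inputSize yields the product with no explicit transpose. This reading is inferred from the call, not documented here; a hedged sketch follows, assuming the Context singleton from context.h and illustrative buffers.

#include "context.h"
#include "feed_forward.h"

// Row-major view: d_in [bsz, 1024], d_w [4096, 1024], d_out [bsz, 4096].
void linear_example(int bsz, const float *d_in, const float *d_w,
                    float *d_out) {
  FeedForward<float> linear(FeedForward<float>::Config(/*outputs=*/4096,
                                                       /*inputs=*/1024));
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  linear.Forward(bsz, d_in, d_w, d_out, handle);
}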
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#define MAX_THREADS 1024
#define WARP_SIZE 32
enum class ActivationType { kRelu, kGelu };

void launch_curand_init(int total_count, int dim, cudaStream_t stream);

template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
                       const T *scale, const T *bias, int batch_size,
                       int hidden_dim, cudaStream_t stream);

template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                  const T *residual_grad, const T *inp_or_out, const T *gamma,
                  const T *betta, const T *vars, const T *means, int batch,
                  int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size,
                         int heads, int from_len, int to_len, bool mask_future,
                         cudaStream_t stream);

template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
                            int softmax_len, cudaStream_t stream);

// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
                           int seq_length, int hidden_dim, int nhead,
                           cudaStream_t stream);

// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
                                     int dim_0, int dim_1, int dim_2,
                                     int dim_3, int dim_4,
                                     cudaStream_t stream);

// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
                             int seq_len, int hidden_dim, int nhead,
                             int trans_count, cudaStream_t stream);

template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
                       float ratio, cudaStream_t stream,
                       bool backward = false);

template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, const T *residual,
                                int total_count, int dim, float ratio,
                                cudaStream_t stream);

template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, int total_count, int dim,
                                float ratio, cudaStream_t stream);

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream);

template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows,
                                       int cols, cudaStream_t stream);

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream);

template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
                         int sz2, int sz1_1, int sz1_2, cudaStream_t stream);

template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
                       int seq_len, int hidden_size, cudaStream_t &stream);

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_lookup_scale_pos_dropout(
    T *output, const int *input, const T *embeddings, const T *pos_embeddings,
    uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
    int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);

template <typename T>
void launch_d_lookup_scale_pos_dropout(
    T *grad_embeddings, const T *grad_output, const int *input,
    const uint8_t *dropout_mask, int batch_size, int seq_len,
    int embedding_dim, int vocab_size, int padding_idx, float dropout_ratio,
    cudaStream_t &stream);

/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
  return id1 * dim2 + id2;
}

/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
                                                  int dim2, int dim3) {
  return id1 * dim2 * dim3 + id2 * dim3 + id3;
}

/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
                                                  int id4, int dim2, int dim3,
                                                  int dim4) {
  // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
  int res = id4;
  int ld = dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}
/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
                                                  int id4, int id5, int dim2,
                                                  int dim3, int dim4,
                                                  int dim5) {
  // return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5)
  // + id4*dim5 + id5;
  int res = id5;
  int ld = dim5;
  res += id4 * ld;
  ld *= dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}
/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
                                                  int id4, int id5, int id6,
                                                  int dim2, int dim3, int dim4,
                                                  int dim5, int dim6) {
  // return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
  // id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
  int res = id6;
  int ld = dim6;
  res += id5 * ld;
  ld *= dim5;
  res += id4 * ld;
  ld *= dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}

/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
    int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
    int *id1, int *id2, int *id3, int *id4, int *id5) {
  *id5 = src % dim5;
  src /= dim5;
  *id4 = src % dim4;
  src /= dim4;
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int dim4, int *id0,
                                                        int *id1, int *id2,
                                                        int *id3, int *id4) {
  *id4 = src % dim4;
  src /= dim4;
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int *id0, int *id1,
                                                        int *id2, int *id3) {
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
                                                        int dim2, int *id0,
                                                        int *id1, int *id2) {
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
                                                        int *id0, int *id1) {
  *id1 = src % dim1;
  *id0 = src / dim1;
}
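
The flat_* and decompose_* helpers are exact inverses, which the kernels rely on when turning linear thread indices back into tensor coordinates. A standalone round-trip check, compiled with nvcc against this header (illustrative only):

#include <cassert>
#include <cstdio>
#include "kernels.h"

int main() {
  const int d1 = 4, d2 = 8;              // trailing dims of a [x, 4, 8] tensor
  int src = flat_3dim(2, 3, 5, d1, d2);  // 2*4*8 + 3*8 + 5 = 93
  int i0, i1, i2;
  decompose_3dim(src, d1, d2, &i0, &i1, &i2);
  assert(i0 == 2 && i1 == 3 && i2 == 5);
  printf("flat_3dim(2,3,5) = %d, decomposes back to (%d, %d, %d)\n", src, i0,
         i1, i2);
  return 0;
}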
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh
deleted
100644 → 0
View file @
bce9499e
// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Normalize_Layer {
 public:
  struct Config {
    uint32_t hidden_dim;
    bool use_mean;
    Config(uint32_t hidden_dim, bool use_mean = false)
        : hidden_dim(hidden_dim), use_mean(use_mean) {}
  };

  Normalize_Layer(Config config, size_t max_rows)
      : config_(config), vars_(nullptr), means_(nullptr) {
    vars_ = cuda_malloc<T>(max_rows);
    if (config_.use_mean) {
      means_ = cuda_malloc<T>(max_rows);
    }
  }

  ~Normalize_Layer() {
    cuda_free(vars_);
    cuda_free(means_);
  }

  void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
               int batch_size, cudaStream_t stream) {
    launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
                      config_.hidden_dim, stream);
  }

  /*
  residual_grad, inp_or_out, betta should be treated carefully.
  inp_or_out = input if use_mean else output
  residual_grad, betta can be nullptr.
  residual_grad will be added to dinp if it is not nullptr,
    which is useful in transformer layers with pre-ln.
  betta is only used to compute xhat;
    (use_mean == false) ^ (betta == nullptr) should be true.
  */
  void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                const T *residual_grad, const T *inp_or_out, const T *gamma,
                const T *betta, int batch_size, cudaStream_t stream[2]) {
    launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
                 inp_or_out, gamma, betta, vars_, means_, batch_size,
                 config_.hidden_dim, stream);
  }

  inline bool use_mean() const { return config_.use_mean; }

 private:
  Config config_;
  T *vars_;
  T *means_;
};
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Softmax {
 public:
  struct Config {
    size_t nhead;
    Config(size_t nhead) : nhead(nhead) {}
  };

  Softmax(Config config) : config_(config) {}

  ~Softmax() {}

  void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
               int to_len, cudaStream_t &stream, bool mask_future = true) {
    launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead,
                           from_len, to_len, mask_future, stream);
  }

  void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
                int to_len, cudaStream_t stream) {
    launch_attn_softmax_bw<T>(out_grad, soft_out,
                              batch_size * config_.nhead * from_len, to_len,
                              stream);
  }

  void reset_size(size_t nhead) { config_.nhead = nhead; }

 private:
  Config config_;
};