Commit ab9d7598 authored by yuguo's avatar yuguo
Browse files
parents 229be5e8 6efebcd0
......@@ -115,7 +115,7 @@ float gelu_forward(float x)
template <typename T, int THREADS_PER_BLOCK>
__global__
void gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) {
void __launch_bounds__(THREADS_PER_BLOCK) gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) {
// fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
......@@ -184,15 +184,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge
// Host-side launcher for gelu_backward_kernel over an m x n gradient tensor.
//
// in           : incoming gradient (dy), m x n, row-major
// out          : output gradient, m x n
// pre_gelu_out : activations saved from the forward pass (gelu input)
// m, n         : matrix dimensions
// stream       : HIP stream the kernel is enqueued on (fully asynchronous)
//
// Each 256-thread block covers up to 256 columns of one row; rows are tiled
// across blocks_per_row blocks. The grid is capped at 65536 blocks --
// NOTE(review): this assumes gelu_backward_kernel loops (grid-strides) over
// any rows beyond the cap; kernel body is not visible here, confirm.
template <typename T, typename Taux>
void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) {
    // Integer ceil-division: exact for every int n, unlike ceil(float(n)/256)
    // which can round incorrectly once n exceeds float's 24-bit mantissa.
    int blocks_per_row = (n + 255) / 256;
    dim3 grid(min(m * blocks_per_row, 65536));
    dim3 block(min(n, 256));
    hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n);
}
template <typename T, typename Tb, int THREADS_PER_BLOCK>
__global__
void add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
void __launch_bounds__(THREADS_PER_BLOCK) add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
......@@ -232,7 +232,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias,
template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK>
__global__
void add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
void __launch_bounds__(THREADS_PER_BLOCK) add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
// only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out
......@@ -288,14 +288,14 @@ void identity_kernel(const Tin* in, T* out, int n) {
// Host-side launcher for identity_kernel: elementwise copy/convert of n
// values from `in` (type Tin) to `out` (type T).
//
// in     : source buffer, n elements
// out    : destination buffer, n elements
// n      : element count
// stream : HIP stream the kernel is enqueued on (fully asynchronous)
//
// One thread per element, 256-thread blocks. The grid overshoots when
// n % 256 != 0 -- NOTE(review): assumes identity_kernel bounds-checks its
// global index against n; kernel body is not visible here, confirm.
template <typename Tin, typename T>
void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) {
    dim3 block, grid;
    block.x = 256;
    // Integer ceil-division: exact for every int n, unlike ceil(n / 256.).
    grid.x = (n + 255) / 256;
    hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n );
}
template <typename T, int THREADS_PER_BLOCK>
__global__
void identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) {
void __launch_bounds__(THREADS_PER_BLOCK) identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) {
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage;
......@@ -331,7 +331,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const
template <typename Tin, int THREADS_PER_BLOCK>
__global__
void bias_gradient_kernel(const Tin* in, float* out, int m, int n) {
void __launch_bounds__(THREADS_PER_BLOCK) bias_gradient_kernel(const Tin* in, float* out, int m, int n) {
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment