"...gmock/git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "9311242db422dd6f24c8e764847fe5d70d0d4859"
Commit ab9d7598 authored by yuguo's avatar yuguo
Browse files
parents 229be5e8 6efebcd0
...@@ -115,7 +115,7 @@ float gelu_forward(float x) ...@@ -115,7 +115,7 @@ float gelu_forward(float x)
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) {
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -184,15 +184,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge ...@@ -184,15 +184,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge
template <typename T, typename Taux> template <typename T, typename Taux>
void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) { void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) {
int blocks_per_row = ceil(float(n)/1024); int blocks_per_row = ceil(float(n)/256);
dim3 grid(min(m * blocks_per_row, 65536)); dim3 grid(min(m * blocks_per_row, 65536));
dim3 block(min(n, 1024)); dim3 block(min(n, 256));
hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n); hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n);
} }
template <typename T, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -232,7 +232,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias, ...@@ -232,7 +232,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias,
template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
// only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out // only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out
...@@ -288,14 +288,14 @@ void identity_kernel(const Tin* in, T* out, int n) { ...@@ -288,14 +288,14 @@ void identity_kernel(const Tin* in, T* out, int n) {
template <typename Tin, typename T> template <typename Tin, typename T>
void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) { void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) {
dim3 block, grid; dim3 block, grid;
block.x = 1024; block.x = 256;
grid.x = ceil( n / 1024.); grid.x = ceil( n / 256.);
hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n ); hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n );
} }
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) { void __launch_bounds__(THREADS_PER_BLOCK) identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) {
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
...@@ -331,7 +331,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const ...@@ -331,7 +331,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const
template <typename Tin, int THREADS_PER_BLOCK> template <typename Tin, int THREADS_PER_BLOCK>
__global__ __global__
void bias_gradient_kernel(const Tin* in, float* out, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) bias_gradient_kernel(const Tin* in, float* out, int m, int n) {
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment