Commit 6efebcd0 authored by wenjh's avatar wenjh
Browse files

[ROCM_GEMM] Fix launch params



Fix launch params are larger than launch bounds(256) for kernels in
rocm_gemm.cu
Signed-off-by: wenjh <wenjh@sugon.com>
parent 388ac735
...@@ -119,7 +119,7 @@ float gelu_forward(float x) ...@@ -119,7 +119,7 @@ float gelu_forward(float x)
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) {
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -188,15 +188,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge ...@@ -188,15 +188,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge
template <typename T, typename Taux> template <typename T, typename Taux>
void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) { void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) {
int blocks_per_row = ceil(float(n)/1024); int blocks_per_row = ceil(float(n)/256);
dim3 grid(min(m * blocks_per_row, 65536)); dim3 grid(min(m * blocks_per_row, 65536));
dim3 block(min(n, 1024)); dim3 block(min(n, 256));
hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n); hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n);
} }
template <typename T, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -236,7 +236,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias, ...@@ -236,7 +236,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias,
template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
// only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out // only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out
...@@ -292,14 +292,14 @@ void identity_kernel(const Tin* in, T* out, int n) { ...@@ -292,14 +292,14 @@ void identity_kernel(const Tin* in, T* out, int n) {
template <typename Tin, typename T> template <typename Tin, typename T>
void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) { void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) {
dim3 block, grid; dim3 block, grid;
block.x = 1024; block.x = 256;
grid.x = ceil( n / 1024.); grid.x = ceil( n / 256.);
hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n ); hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n );
} }
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) { void __launch_bounds__(THREADS_PER_BLOCK) identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) {
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
...@@ -335,7 +335,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const ...@@ -335,7 +335,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const
template <typename Tin, int THREADS_PER_BLOCK> template <typename Tin, int THREADS_PER_BLOCK>
__global__ __global__
void bias_gradient_kernel(const Tin* in, float* out, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) bias_gradient_kernel(const Tin* in, float* out, int m, int n) {
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment