Commit 6efebcd0 authored by wenjh's avatar wenjh
Browse files

[ROCM_GEMM] Fix launch params



Fix launch params are larger than launch bounds(256) for kernels in
rocm_gemm.cu
Signed-off-by: wenjh <wenjh@sugon.com>
parent 388ac735
...@@ -119,7 +119,7 @@ float gelu_forward(float x) ...@@ -119,7 +119,7 @@ float gelu_forward(float x)
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) gelu_forward_kernel(const float* in, T* out, float* amax, const float* scale, int m, int n) {
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -188,15 +188,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge ...@@ -188,15 +188,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge
template <typename T, typename Taux> template <typename T, typename Taux>
void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) { void gelu_backward_kernelLauncher(const float* in, T* out, const Taux* pre_gelu_out, int m, int n, hipStream_t stream) {
int blocks_per_row = ceil(float(n)/1024); int blocks_per_row = ceil(float(n)/256);
dim3 grid(min(m * blocks_per_row, 65536)); dim3 grid(min(m * blocks_per_row, 65536));
dim3 block(min(n, 1024)); dim3 block(min(n, 256));
hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n); hipLaunchKernelGGL(( gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block), 0, stream, in, out, pre_gelu_out, m, n);
} }
template <typename T, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_kernel(const float* in, T* out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
...@@ -236,7 +236,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias, ...@@ -236,7 +236,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias,
template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK> template <typename T, typename Taux, typename Tb, int THREADS_PER_BLOCK>
__global__ __global__
void add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){ void __launch_bounds__(THREADS_PER_BLOCK) add_bias_gelu_kernel(const float* in, T* out, Taux* pre_gelu_out, const Tb* __restrict bias, float* amax, const float* scale, int m, int n){
// fp8 output flow // fp8 output flow
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
// only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out // only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out
...@@ -292,14 +292,14 @@ void identity_kernel(const Tin* in, T* out, int n) { ...@@ -292,14 +292,14 @@ void identity_kernel(const Tin* in, T* out, int n) {
template <typename Tin, typename T> template <typename Tin, typename T>
void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) { void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) {
dim3 block, grid; dim3 block, grid;
block.x = 1024; block.x = 256;
grid.x = ceil( n / 1024.); grid.x = ceil( n / 256.);
hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n ); hipLaunchKernelGGL(( identity_kernel<Tin, T>), dim3(grid), dim3(block), 0, stream, in, out, n );
} }
template <typename T, int THREADS_PER_BLOCK> template <typename T, int THREADS_PER_BLOCK>
__global__ __global__
void identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) { void __launch_bounds__(THREADS_PER_BLOCK) identity_output_kernel(const float* in, T* out, float* amax, const float* scale, int n) {
if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){ if constexpr(std::is_same<T, fp8e4m3>::value ||std::is_same<T, fp8e5m2>::value){
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
...@@ -335,7 +335,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const ...@@ -335,7 +335,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const
template <typename Tin, int THREADS_PER_BLOCK> template <typename Tin, int THREADS_PER_BLOCK>
__global__ __global__
void bias_gradient_kernel(const Tin* in, float* out, int m, int n) { void __launch_bounds__(THREADS_PER_BLOCK) bias_gradient_kernel(const Tin* in, float* out, int m, int n) {
typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce; typedef hipcub::BlockReduce<float, THREADS_PER_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage block_temp_storage; __shared__ typename BlockReduce::TempStorage block_temp_storage;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment