Commit 8f7de847 authored by yuguo960516yuguo

dtk

parent f262efc9
Pipeline #248 failed with stages in 0 seconds
@@ -24,273 +24,462 @@ limitations under the License.
#include "oneflow/core/ep/include/primitive/matmul.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/layer_norm.hip.h"
#include <hipcub/hipcub.hpp>
#include <thrust/pair.h>
namespace oneflow {
template <typename T, bool is_cuda>
struct AccumulateType { };
namespace {
template<typename SRC, typename DST, bool do_scale, bool do_center>
struct AffineStore {
AffineStore(DST* y, int64_t row_size, const DST* gamma, const DST* beta)
: y(y), row_size(row_size), gamma(gamma), beta(beta) {}
template<int N>
__device__ void store(const SRC* src, int64_t row, int64_t col) {
cuda::layer_norm::Pack<DST, N> y_pack;
cuda::layer_norm::Pack<DST, N> gamma_pack;
cuda::layer_norm::Pack<DST, N> beta_pack;
const int64_t offset = (row * row_size + col) / N;
const int64_t gamma_offset = col / N;
if (do_scale) {
gamma_pack.storage =
*(reinterpret_cast<const cuda::layer_norm::PackType<DST, N>*>(gamma) + gamma_offset);
} else {
#pragma unroll
for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = 1; }
#if defined(__HIPCC__)
template <> struct AccumulateType<half, true> { using type = float; };
#endif
template <> struct AccumulateType<float, true> { using type = float; };
template <> struct AccumulateType<double, true> { using type = double; };
template <> struct AccumulateType<int8_t, true> { using type = int64_t; };
template <> struct AccumulateType<uint8_t, true> { using type = int64_t; };
template <> struct AccumulateType<char, true> { using type = int64_t; };
template <> struct AccumulateType<int16_t, true> { using type = int64_t; };
template <> struct AccumulateType<int32_t, true> { using type = int64_t; };
template <> struct AccumulateType<int64_t, true> { using type = int64_t; };
template <> struct AccumulateType<bool, true> { using type = bool; };
template <> struct AccumulateType<float, false> { using type = double; };
template <> struct AccumulateType<double, false> { using type = double; };
template <> struct AccumulateType<int8_t, false> { using type = int64_t; };
template <> struct AccumulateType<uint8_t, false> { using type = int64_t; };
template <> struct AccumulateType<char, false> { using type = int64_t; };
template <> struct AccumulateType<int16_t, false> { using type = int64_t; };
template <> struct AccumulateType<int32_t, false> { using type = int64_t; };
template <> struct AccumulateType<int64_t, false> { using type = int64_t; };
template <> struct AccumulateType<bool, false> { using type = bool; };
template<typename T, bool is_cuda>
using acc_type = typename AccumulateType<T, is_cuda>::type;
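// The specializations above mirror PyTorch's AccumulateType table: half accumulates in float,
// float stays float on the GPU (double on the host side), and all integral types accumulate in
// int64_t. A minimal compile-time sketch of how the alias is expected to resolve (illustrative
// only, assuming <type_traits> is available in this translation unit):
//
//   static_assert(std::is_same<acc_type<float, true>, float>::value, "");
//   static_assert(std::is_same<acc_type<double, true>, double>::value, "");
//   static_assert(std::is_same<acc_type<int32_t, true>, int64_t>::value, "");
//   static_assert(std::is_same<acc_type<float, false>, double>::value, "");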
#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__
#define C10_HOST __host__
#define C10_WARP_SIZE 64
#define VEC 4
typedef int64_t IndexType;
constexpr int BlockReduceNumThreads = 512;
constexpr int NumThreads = 256;
constexpr int ColwiseReduceTileSize = 32;
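// BlockReduceNumThreads (512) is the block size used by the per-row forward and dX kernels below;
// NumThreads (256) drives the one-thread-per-column dgamma/dbeta path; ColwiseReduceTileSize (32)
// is the tile edge of the tiled dgamma/dbeta reduction. C10_WARP_SIZE is 64 because AMD wavefronts
// are 64 lanes wide.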
template <typename scalar_t, typename index_t, typename combine_t>
struct WelfordData {
scalar_t mean;
scalar_t m2;
index_t n;
combine_t nf;
C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
C10_HOST_DEVICE WelfordData(
scalar_t mean,
scalar_t m2,
index_t n,
combine_t nf)
: mean(mean), m2(m2), n(n), nf(nf) {}
};
template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t, typename res_t>
struct WelfordOps {
public:
using acc_t = WelfordData<acc_scalar_t, index_t, combine_t>;
inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data) const {
acc_scalar_t delta = data - acc.mean;
// use acc.nf (combine_t) here, as acc.n (index_t) would be converted anyway;
// the count accumulated in reduce() is still tracked through index_t
acc_scalar_t new_mean = acc.mean + delta / (acc.nf + 1);
acc_scalar_t new_delta = data - new_mean;
return {
new_mean,
acc.m2 + delta * new_delta,
acc.n + 1,
combine_t(acc.n + 1), // accumulate for combine_t uses index_t
};
}
if (do_center) {
beta_pack.storage =
*(reinterpret_cast<const cuda::layer_norm::PackType<DST, N>*>(beta) + gamma_offset);
} else {
#pragma unroll
for (int i = 0; i < N; ++i) { beta_pack.elem[i] = 0; }
inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
if (a.nf == 0) {
return b;
}
#pragma unroll
for (int i = 0; i < N; ++i) {
DST normalized_i = static_cast<DST>(src[i]);
if (do_scale || do_center) {
y_pack.elem[i] = normalized_i * gamma_pack.elem[i] + beta_pack.elem[i];
} else {
y_pack.elem[i] = normalized_i;
if (b.nf == 0) {
return a;
}
acc_scalar_t delta = b.mean - a.mean;
combine_t new_count = a.nf + b.nf;
acc_scalar_t nb_over_n = b.nf / new_count;
return {
a.mean + delta * nb_over_n,
a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
// set acc.n to -1: index_t may not be able to represent the combined count
// within its range, so -1 marks it as invalid to avoid confusion
-1,
new_count
};
}
*(reinterpret_cast<cuda::layer_norm::PackType<DST, N>*>(y) + offset) = y_pack.storage;
inline C10_DEVICE res_t project(acc_t acc) const {
return res_t(acc.m2 / acc.nf, static_cast<scalar_t>(acc.mean));
}
DST* y;
int64_t row_size;
const DST* gamma;
const DST* beta;
};
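// Host-side reference of the Welford reduce()/project() pair above, handy for checking the kernel
// numerically. A minimal sketch with hypothetical names (`row` and `eps` are assumptions, not part
// of this file):
//
//   double mean = 0.0, m2 = 0.0, n = 0.0;
//   for (double x : row) {             // one row of the input
//     n += 1.0;
//     const double delta = x - mean;
//     mean += delta / n;
//     m2 += delta * (x - mean);
//   }
//   const double var  = m2 / n;        // project(): biased (population) variance
//   const double rstd = 1.0 / std::sqrt(var + eps);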
template<typename SRC, typename DST, bool do_scale>
struct ScaleLoad {
ScaleLoad(const SRC* src, const SRC* gamma, int64_t row_size)
: src(src), gamma(gamma), row_size(row_size) {}
template<int N>
__device__ void load(DST* dst, int64_t row, int64_t col) const {
cuda::layer_norm::Pack<SRC, N> src_pack;
cuda::layer_norm::Pack<SRC, N> gamma_pack;
const int64_t offset = (row * row_size + col) / N;
const int64_t gamma_offset = col / N;
src_pack.storage = *(reinterpret_cast<const cuda::layer_norm::PackType<SRC, N>*>(src) + offset);
if (do_scale) {
gamma_pack.storage =
*(reinterpret_cast<const cuda::layer_norm::PackType<SRC, N>*>(gamma) + gamma_offset);
} else {
#pragma unroll
for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = static_cast<SRC>(1); }
}
#pragma unroll
for (int i = 0; i < N; ++i) {
dst[i] = static_cast<DST>(src_pack.elem[i] * gamma_pack.elem[i]);
}
inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {
return {
__shfl_down(acc.mean, offset)
, __shfl_down(acc.m2, offset)
, __shfl_down(acc.n, offset)
, __shfl_down(acc.nf, offset)
};
}
const SRC* src;
const SRC* gamma;
int64_t row_size;
};
template<typename SRC, typename DST, bool do_add>
struct AddStore {
AddStore(const DST* add_to_output, DST* dst, int64_t row_size)
: add_to_output(add_to_output), dst(dst), row_size(row_size) {}
template<int N>
__device__ void store(const SRC* src, int64_t row, int64_t col) {
cuda::layer_norm::Pack<DST, N> add_to_output_pack;
cuda::layer_norm::Pack<DST, N> dst_pack;
const int64_t offset = (row * row_size + col) / N;
if (do_add) {
add_to_output_pack.storage =
*(reinterpret_cast<const cuda::layer_norm::PackType<DST, N>*>(add_to_output) + offset);
template <typename T, class ReduceOp>
__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) {
#pragma unroll
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val = op.combine(val, op.warp_shfl_down(val, offset));
}
return val;
}
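// WarpReduce combines values across one wavefront with a shuffle butterfly: starting at offset
// C10_WARP_SIZE / 2 = 32 and halving each step, lane i folds in lane i + offset, so after
// log2(64) = 6 steps lane 0 holds the reduction of all 64 lanes. The plain-sum helpers below use
// the same pattern with operator+ instead of op.combine.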
template <typename T, class ReduceOp>
__inline__ __device__ T WarpReduce(T val, int max, const ReduceOp& op) {
#pragma unroll
for (int i = 0; i < N; ++i) {
if (do_add) {
dst_pack.elem[i] = static_cast<DST>(src[i]) + add_to_output_pack.elem[i];
} else {
dst_pack.elem[i] = static_cast<DST>(src[i]);
for (int offset = max; offset > 0; offset >>= 1) {
val = op.combine(val, op.warp_shfl_down(val, offset));
}
return val;
}
template <typename T, class ReduceOp>
__inline__ __device__ T
BlockReduce(T val, const ReduceOp& op, T* shared) {
const int lid = threadIdx.x % C10_WARP_SIZE;
const int wid = threadIdx.x / C10_WARP_SIZE;
val = WarpReduce(val, op);
__syncthreads();
if (lid == 0) {
shared[wid] = val;
}
*(reinterpret_cast<cuda::layer_norm::PackType<DST, N>*>(dst) + offset) = dst_pack.storage;
__syncthreads();
if (wid == 0) {
val = shared[lid];
val = WarpReduce(val, blockDim.x / C10_WARP_SIZE / 2, op);
}
const DST* add_to_output;
DST* dst;
int64_t row_size;
};
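// BlockReduce above is the usual two-stage pattern: each warp reduces its own lanes, lane 0 of
// every warp writes one partial to shared memory, and the first warp then reduces those partials.
// With BlockReduceNumThreads = 512 and a 64-lane wavefront there are 512 / 64 = 8 partials, so the
// final WarpReduce only needs offsets 4, 2, 1 (blockDim.x / C10_WARP_SIZE / 2 = 4).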
template<typename T>
__inline__ __device__ T WarpReduce(T val) {
// for (int mask = 16; mask > 0; mask /= 2) { val += __shfl_down_sync(0xffffffff, val, mask); }
for (int mask = 32; mask > 0; mask /= 2) { val += __shfl_down(val, mask, 64); }
return val;
}
constexpr int tile_size = 32;
constexpr int num_per_block = 4;
constexpr int block_dim_x = 32;
constexpr int block_dim_y = 32 / num_per_block;
template<typename T, typename ComputeType>
__global__ void LayerNormParamGrad(int rows, int cols, const T* __restrict__ dy,
const T* __restrict__ x, const ComputeType* __restrict__ mean,
const ComputeType* __restrict__ inv_var,
T* __restrict__ tmp_gamma_diff, T* __restrict__ tmp_beta_diff) {
__shared__ ComputeType dgamma[32][33];
__shared__ ComputeType dbeta[32][33];
ComputeType dgamma_sum[num_per_block];
ComputeType dbeta_sum[num_per_block];
template <typename T>
__inline__ __device__ T WarpReduceSum(T val) {
#pragma unroll
for (int index = 0; index < num_per_block; ++index) {
dgamma_sum[index] = 0;
dbeta_sum[index] = 0;
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val += __shfl_down(val, offset);
}
const int col_id = blockIdx.x * blockDim.x + threadIdx.x;
if (col_id < cols) {
for (int i = blockIdx.y * tile_size + threadIdx.y; i < rows; i += tile_size * gridDim.y) {
return val;
}
template <typename T>
__inline__ __device__ T WarpReduceSum(T val, int max) {
#pragma unroll
for (int index = 0; index < num_per_block; ++index) {
int row_id = i + index * blockDim.y;
if (row_id < rows) {
int offset = row_id * cols + col_id;
const ComputeType dy_val = static_cast<ComputeType>(dy[offset]);
const ComputeType x_val = static_cast<ComputeType>(x[offset]);
const ComputeType mean_val = mean[row_id];
const ComputeType inv_var_val = inv_var[row_id];
dgamma_sum[index] += dy_val * (x_val - mean_val) * inv_var_val;
dbeta_sum[index] += dy_val;
}
for (int offset = max; offset > 0; offset >>= 1) {
val += __shfl_down(val, offset);
}
return val;
}
template <typename T>
__inline__ __device__ T BlockReduceSum(T val, T* shared) {
const int lid = threadIdx.x % C10_WARP_SIZE;
const int wid = threadIdx.x / C10_WARP_SIZE;
val = WarpReduceSum(val);
__syncthreads();
if (lid == 0) {
shared[wid] = val;
}
__syncthreads();
if (wid == 0) {
val = shared[lid];
val = WarpReduceSum(val, blockDim.x / C10_WARP_SIZE / 2);
}
#pragma unroll
for (int index = 0; index < num_per_block; ++index) {
dgamma[index * blockDim.y + threadIdx.y][threadIdx.x] = dgamma_sum[index];
dbeta[index * blockDim.y + threadIdx.y][threadIdx.x] = dbeta_sum[index];
return val;
}
template <typename scalar_t>
__global__ void layernorm_forward_kernel(const scalar_t* input, scalar_t* ret, acc_type<scalar_t, true>* mean, acc_type<scalar_t, true>* rstd,
const scalar_t* gamma, const scalar_t* beta, IndexType cols, double eps)
{
// dropout is a no-op in eval mode
IndexType i = blockIdx.x;
// layer norm: first compute this row's mean and rstd
using T_ACC = acc_type<scalar_t, true>;
using WelfordType = WelfordData<T_ACC, IndexType, T_ACC>;
using WelfordOp = WelfordOps<T_ACC, T_ACC, IndexType, T_ACC, thrust::pair<T_ACC, T_ACC>>;
__shared__ typename std::aligned_storage<sizeof(WelfordType), alignof(WelfordType)>::type val_shared[BlockReduceNumThreads/C10_WARP_SIZE];
WelfordType* val_shared_ptr = reinterpret_cast<WelfordType*>(val_shared);
WelfordOp welford_op;
WelfordType val;
#pragma unroll
for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) {
IndexType index = i * cols + j;
val = welford_op.reduce(val, static_cast<T_ACC>(input[index]));
}
__syncthreads();
#pragma unroll
for (int index = 0; index < num_per_block; ++index) {
const int col_id = blockIdx.x * blockDim.x + threadIdx.y + index * blockDim.y;
if (col_id < cols) {
ComputeType gamma_sum = dgamma[threadIdx.x][threadIdx.y + index * blockDim.y];
ComputeType beta_sum = dbeta[threadIdx.x][threadIdx.y + index * blockDim.y];
ComputeType global_dgamma = WarpReduce<ComputeType>(gamma_sum);
ComputeType global_dbeta = WarpReduce<ComputeType>(beta_sum);
val = BlockReduce(val,welford_op,val_shared_ptr);
__shared__ T_ACC s_mean;
__shared__ T_ACC s_rstd;
if (threadIdx.x == 0) {
const int offset = blockIdx.y * cols + col_id;
tmp_gamma_diff[offset] = global_dgamma;
tmp_beta_diff[offset] = global_dbeta;
}
thrust::tie(s_rstd, s_mean) = welford_op.project(val);
mean[i] = s_mean;
s_rstd = rsqrt(s_rstd + static_cast<T_ACC>(eps));
rstd[i] = s_rstd;
}
__syncthreads();
// layer norm: (x - mean) * rstd * gamma + beta
#pragma unroll
for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) {
IndexType index = i * cols + j;
ret[index] = (static_cast<T_ACC>(input[index]) - s_mean)*s_rstd * (gamma == nullptr ? T_ACC(1) : static_cast<T_ACC>(gamma[j]))
+ (beta == nullptr ? T_ACC(0) : static_cast<T_ACC>(beta[j]));
}
}
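// The kernel above is launched with one block per row (grid = M) and BlockReduceNumThreads threads
// per block (see LayerNormKernelImplInternal below); each thread strides over the row in steps of
// blockDim.x, so a row of length cols touches ceil(cols / 512) elements per thread. mean[i] holds
// the row mean and rstd[i] holds 1 / sqrt(var + eps).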
template<typename T>
int GetGirdDimY(const int64_t num_instances, const int64_t norm_size) {
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
const int grid_dim_x = (norm_size + tile_size - 1) / tile_size;
const int max_grid_dim_y = (num_instances + tile_size - 1) / tile_size;
const int block_size = block_dim_x * block_dim_y;
int max_active_blocks = 0;
OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks, LayerNormParamGrad<T, ComputeType>, block_size, 0));
int waves = 1;
int dev;
OF_CUDA_CHECK(hipGetDevice(&dev));
int sm_count;
OF_CUDA_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev));
int num_blocks = max_active_blocks * sm_count * waves;
int grid_dim_y = std::min(max_grid_dim_y, static_cast<int>(num_blocks / grid_dim_x));
return std::max(grid_dim_y, 1);
template <typename T>
void LayerNormKernelImplInternal(
oneflow::ep::Stream* stream,
const T* X,
const T* gamma,
const T* beta,
int64_t M,
int64_t N,
double eps,
T* Y,
acc_type<T, true>* mean,
acc_type<T, true>* rstd) {
using T_ACC = acc_type<T, true>;
const T* X_data = X;
const T* gamma_data = gamma;
const T* beta_data = beta;
T* Y_data = Y;
T_ACC* mean_data = mean;
T_ACC* rstd_data = rstd;
hipStream_t cuda_stream = stream->As<oneflow::ep::CudaStream>()->cuda_stream();
layernorm_forward_kernel<T><<<M, BlockReduceNumThreads, 0, cuda_stream>>>(
X_data, Y_data, mean_data, rstd_data, gamma_data, beta_data, N, eps);
}
template<typename T, bool do_scale, bool do_center>
void LayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size,
const double epsilon, const T* x_ptr, const T* gamma_ptr,
const T* beta_ptr, T* y_ptr, user_op::Tensor* mean,
user_op::Tensor* inv_variance) {
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
cuda::layer_norm::DirectLoad<T, ComputeType> load(x_ptr, norm_size);
AffineStore<ComputeType, T, do_scale, do_center> store(y_ptr, norm_size, gamma_ptr, beta_ptr);
cuda::layer_norm::DispatchLayerNorm<decltype(load), decltype(store), ComputeType>(
stream->As<ep::CudaStream>()->cuda_stream(), load, store, num_instances, norm_size, epsilon,
mean->mut_dptr<ComputeType>(), inv_variance->mut_dptr<ComputeType>());
template <typename scalar_t>
__global__ void GammaBetaBackwardSimple(IndexType M, IndexType N, const scalar_t* dY, const scalar_t* X, const acc_type<scalar_t, true>* mean,
const acc_type<scalar_t, true>* rstd, scalar_t* dg, scalar_t* db)
{
using T_ACC = acc_type<scalar_t, true>;
const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
if (j < N) {
T_ACC sum1 = 0;
T_ACC sum2 = 0;
for (int64_t i = 0; i < M; ++i) {
const int64_t index = i * N + j;
sum1 += dg == nullptr ? T_ACC(0)
: static_cast<T_ACC>(dY[index]) *
(static_cast<T_ACC>(X[index]) - static_cast<T_ACC>(mean[i])) *
static_cast<T_ACC>(rstd[i]);
sum2 += db == nullptr ? T_ACC(0) : static_cast<T_ACC>(dY[index]);
}
if (dg != nullptr) {
dg[j] = sum1;
}
if (db != nullptr) {
db[j] = sum2;
}
}
}
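// GammaBetaBackwardSimple assigns one thread to each column j and walks all M rows serially;
// per column it accumulates
//   dg[j] = sum_i dY[i,j] * (X[i,j] - mean[i]) * rstd[i]
//   db[j] = sum_i dY[i,j]
// which are the standard layer-norm parameter gradients. It is only used for small M
// (see the M < 512 branch further down), where launching the tiled kernel would not pay off.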
template<typename T>
void DispatchLayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances,
const int64_t norm_size, const double epsilon, const T* x_ptr,
const T* gamma_ptr, const T* beta_ptr, T* y_ptr,
user_op::Tensor* mean, user_op::Tensor* inv_variance) {
if (gamma_ptr != nullptr && beta_ptr != nullptr) {
LayerNormForwardGpu<T, true, true>(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr,
beta_ptr, y_ptr, mean, inv_variance);
} else if (gamma_ptr != nullptr && beta_ptr == nullptr) {
LayerNormForwardGpu<T, true, false>(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr,
beta_ptr, y_ptr, mean, inv_variance);
} else if (gamma_ptr == nullptr && beta_ptr != nullptr) {
LayerNormForwardGpu<T, false, true>(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr,
beta_ptr, y_ptr, mean, inv_variance);
} else {
LayerNormForwardGpu<T, false, false>(stream, num_instances, norm_size, epsilon, x_ptr,
gamma_ptr, beta_ptr, y_ptr, mean, inv_variance);
template <typename scalar_t>
__global__ void GammaBetaBackward(IndexType M, IndexType N, const scalar_t* dY, const scalar_t* X, const acc_type<scalar_t, true>* mean,
const acc_type<scalar_t, true>* rstd, scalar_t* dg, scalar_t* db)
{
using T_ACC = acc_type<scalar_t, true>;
__shared__ T_ACC g_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1];
__shared__ T_ACC b_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1];
const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
T_ACC dg_sum1 = 0;
T_ACC dg_sum2 = 0;
T_ACC db_sum1 = 0;
T_ACC db_sum2 = 0;
if (j < N) {
for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) {
const int64_t i1 = i;
const int64_t i2 = i + blockDim.y;
const int64_t index1 = i1 * N + j;
const int64_t index2 = i2 * N + j;
dg_sum1 += dg == nullptr ? T_ACC(0)
: static_cast<T_ACC>(dY[index1]) *
(static_cast<T_ACC>(X[index1]) - static_cast<T_ACC>(mean[i1])) *
static_cast<T_ACC>(rstd[i1]);
db_sum1 += db == nullptr ? T_ACC(0) : static_cast<T_ACC>(dY[index1]);
if (i2 < M) {
dg_sum2 += dg == nullptr ? T_ACC(0)
: static_cast<T_ACC>(dY[index2]) *
(static_cast<T_ACC>(X[index2]) - static_cast<T_ACC>(mean[i2])) *
static_cast<T_ACC>(rstd[i2]);
db_sum2 += db == nullptr ? T_ACC(0) : static_cast<T_ACC>(dY[index2]);
}
}
}
g_shared[threadIdx.y][threadIdx.x] = dg_sum1;
g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2;
b_shared[threadIdx.y][threadIdx.x] = db_sum1;
b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2;
__syncthreads();
T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y];
T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y];
sum1 = WarpReduceSum(sum1);
sum2 = WarpReduceSum(sum2);
if (threadIdx.x == 0) {
const int64_t j = blockIdx.x * blockDim.x + threadIdx.y;
if (j < N) {
if (dg != nullptr) {
dg[j] = sum1;
}
if (db != nullptr) {
db[j] = sum2;
}
}
}
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = WarpReduceSum(sum1);
sum2 = WarpReduceSum(sum2);
if (threadIdx.x == 0) {
const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y;
if (j < N) {
if (dg != nullptr) {
dg[j] = sum1;
}
if (db != nullptr) {
db[j] = sum2;
}
}
}
}
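// The [ColwiseReduceTileSize][ColwiseReduceTileSize + 1] shape of g_shared / b_shared pads each
// row by one element so the transposed read (g_shared[threadIdx.x][threadIdx.y]) does not hit
// shared-memory bank conflicts; the dgamma/dbeta tiles in the old LayerNormParamGrad kernel above
// use the same [32][33] trick.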
template<typename T, bool do_scale, bool do_add>
void LayerNormBackwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size,
const T* dy_ptr, const T* x_ptr, const user_op::Tensor* mean,
const user_op::Tensor* inv_variance, const T* gamma_ptr,
const T* add_to_output_ptr, T* dx_ptr) {
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
cuda::layer_norm::DirectLoad<T, ComputeType> load_x(x_ptr, norm_size);
ScaleLoad<T, ComputeType, do_scale> load_scaled_dy(dy_ptr, gamma_ptr, norm_size);
AddStore<ComputeType, T, do_add> store(add_to_output_ptr, dx_ptr, norm_size);
OF_CUDA_CHECK((cuda::layer_norm::DispatchLayerNormGrad<decltype(load_x), decltype(load_scaled_dy),
decltype(store), ComputeType>(
stream->As<ep::CudaStream>()->cuda_stream(), load_x, load_scaled_dy, store,
mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(), num_instances, norm_size)));
template <typename scalar_t>
__global__ void LayerNormBackward_kernel(IndexType N, const scalar_t* dY, const scalar_t* X, const scalar_t* gamma, const acc_type<scalar_t, true>* mean,
const acc_type<scalar_t, true>* rstd, scalar_t* dX, const scalar_t* add_to_output)
{
using T_ACC = acc_type<scalar_t, true>;
__shared__ T_ACC ds_shared[C10_WARP_SIZE];
__shared__ T_ACC db_shared[C10_WARP_SIZE];
const IndexType i = blockIdx.x;
T_ACC sum1 = 0;
T_ACC sum2 = 0;
#pragma unroll
for (IndexType j = threadIdx.x; j < N; j += blockDim.x) {
const IndexType index = i * N + j;
const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast<T_ACC>(gamma[j]);
sum1 += static_cast<T_ACC>(dY[index]) * static_cast<T_ACC>(X[index]) * gamma_v;
sum2 += static_cast<T_ACC>(dY[index]) * gamma_v;
}
sum1 = BlockReduceSum<T_ACC>(sum1, ds_shared);
sum2 = BlockReduceSum<T_ACC>(sum2, db_shared);
const T_ACC s = T_ACC(1) / static_cast<T_ACC>(N);
__shared__ T_ACC b;
__shared__ T_ACC c;
if (threadIdx.x == 0) {
b = (sum2 * static_cast<T_ACC>(mean[i]) - sum1) * static_cast<T_ACC>(rstd[i]) * static_cast<T_ACC>(rstd[i]) * static_cast<T_ACC>(rstd[i]) * s;
c = -(b * static_cast<T_ACC>(mean[i]) + sum2 * static_cast<T_ACC>(rstd[i]) * s);
}
__syncthreads();
#pragma unroll
for (IndexType j = threadIdx.x; j < N; j += blockDim.x) {
const IndexType index = i * N + j;
const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast<T_ACC>(gamma[j]);
dX[index] = static_cast<T_ACC>(rstd[i]) * static_cast<T_ACC>(dY[index]) * gamma_v + b * static_cast<T_ACC>(X[index]) + c
+ (add_to_output == nullptr ? T_ACC(0) : static_cast<T_ACC>(add_to_output[index]));
}
}
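// Derivation of the b/c form used above (with x_hat = (X - mean) * rstd and g = dY * gamma):
// the standard per-element layer-norm input gradient is
//   dX = rstd * ( g - mean(g) - x_hat * mean(g * x_hat) )
// Expanding x_hat and collecting the terms that depend on X gives
//   dX = rstd * g + b * X + c,  with
//   b = (sum(g) * mean - sum(g * X)) * rstd^3 / N
//   c = -(b * mean + sum(g) * rstd / N)
// which is exactly what the threadIdx.x == 0 branch computes from sum1 = sum(g * X) and
// sum2 = sum(g).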
template<typename T, bool do_scale>
void DispatchLayerNormBackwardDoAdd(ep::Stream* stream, const int64_t num_instances,
const int64_t norm_size, const T* dy_ptr, const T* x_ptr,
const user_op::Tensor* mean,
const user_op::Tensor* inv_variance, const T* gamma_ptr,
const T* add_to_output_ptr, T* dx_ptr) {
if (add_to_output_ptr != nullptr) {
LayerNormBackwardGpu<T, do_scale, true>(stream, num_instances, norm_size, dy_ptr, x_ptr, mean,
inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr);
} else {
LayerNormBackwardGpu<T, do_scale, false>(stream, num_instances, norm_size, dy_ptr, x_ptr, mean,
inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr);
template <typename T>
void LayerNormBackwardKernelImplInternal(
oneflow::ep::Stream* stream,
const T* dY,
const T* X,
const acc_type<T, true>* mean,
const acc_type<T, true>* rstd,
const T* gamma,
int64_t M,
int64_t N,
T* dX,
const T* add_to_output) {
using T_ACC = acc_type<T, true>;
const T* dY_data = dY;
const T* X_data = X;
const T_ACC* mean_data = mean;
const T_ACC* rstd_data = rstd;
const T* gamma_data = gamma;
T* dX_data = dX;
const T* add_to_output_data = add_to_output;
hipStream_t cuda_stream = stream->As<oneflow::ep::CudaStream>()->cuda_stream();
if (dX_data != nullptr) {
LayerNormBackward_kernel<T><<<M, BlockReduceNumThreads, 0, cuda_stream>>>(
N, dY_data, X_data, gamma_data, mean_data, rstd_data, dX_data, add_to_output_data);
}
}
template<typename T>
void LaunchLayerNormBackward(ep::Stream* stream, const int64_t num_instances,
const int64_t norm_size, const T* dy_ptr, const T* x_ptr,
const user_op::Tensor* mean, const user_op::Tensor* inv_variance,
const T* gamma_ptr, const T* add_to_output_ptr, T* dx_ptr) {
if (gamma_ptr != nullptr) {
DispatchLayerNormBackwardDoAdd<T, true>(stream, num_instances, norm_size, dy_ptr, x_ptr, mean,
inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr);
template <typename T>
void LayerNormBackwardKernelImplInternalParam(
oneflow::ep::Stream* stream,
const T* dY,
const T* X,
const acc_type<T, true>* mean,
const acc_type<T, true>* rstd,
int64_t M,
int64_t N,
T* dgamma,
T* dbeta) {
using T_ACC = acc_type<T, true>;
const T* dY_data = dY;
const T* X_data = X;
const T_ACC* mean_data = mean;
const T_ACC* rstd_data = rstd;
hipStream_t cuda_stream = stream->As<oneflow::ep::CudaStream>()->cuda_stream();
T* dgamma_data = dgamma;
T* dbeta_data = dbeta;
if (M < 512) {
// For small batch size, do colwise reduce directly.
const int64_t B = (N + NumThreads - 1) / NumThreads;
GammaBetaBackwardSimple<T>
<<<B, NumThreads, 0, cuda_stream>>>(
M,
N,
dY_data,
X_data,
mean_data,
rstd_data,
dgamma_data,
dbeta_data);
} else {
DispatchLayerNormBackwardDoAdd<T, false>(stream, num_instances, norm_size, dy_ptr, x_ptr, mean,
inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr);
const int64_t B =
(N + ColwiseReduceTileSize - 1) / ColwiseReduceTileSize;
constexpr int kThreadX = ColwiseReduceTileSize;
constexpr int kThreadY = ColwiseReduceTileSize / 2;
GammaBetaBackward<T>
<<<B, dim3(kThreadX, kThreadY), 0, cuda_stream>>>(
M,
N,
dY_data,
X_data,
mean_data,
rstd_data,
dgamma_data,
dbeta_data);
}
}
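// Dispatch summary for the parameter-gradient path: M < 512 takes the one-thread-per-column kernel,
// larger M takes the tiled kernel with a 32 x 16 thread block (ColwiseReduceTileSize x
// ColwiseReduceTileSize / 2) covering 32 columns per block. An illustrative call, assuming float
// data and hypothetical pointer names (not part of this file):
//
//   LayerNormBackwardKernelImplInternalParam<float>(
//       stream, dy, x, mean, rstd, /*M=*/rows, /*N=*/cols, dgamma, dbeta);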
} // namespace
namespace oneflow {
template<typename T>
class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
@@ -306,10 +495,9 @@ class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaG
user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0);
user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
const double epsilon = ctx->Attr<double>("epsilon");
CHECK_GE(epsilon, HIPDNN_BN_MIN_EPSILON);
const int64_t num_instances = mean->shape_view().elem_cnt();
const int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
double epsilon = ctx->Attr<double>("epsilon");
int64_t num_instances = mean->shape_view().elem_cnt();
int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
const T* gamma_ptr = nullptr;
const T* beta_ptr = nullptr;
if (ctx->has_input("gamma", 0)) {
@@ -318,8 +506,11 @@ class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaG
CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size);
}
if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr<T>(); }
DispatchLayerNormForwardGpu<T>(ctx->stream(), num_instances, norm_size, epsilon, x->dptr<T>(),
gamma_ptr, beta_ptr, y->mut_dptr<T>(), mean, inv_variance);
// DispatchLayerNormForwardGpu<T>(ctx->stream(), num_instances, norm_size, epsilon, x->dptr<T>(),
// gamma_ptr, beta_ptr, y->mut_dptr<T>(), mean, inv_variance);
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
LayerNormKernelImplInternal<T>(ctx->stream(), x->dptr<T>(), gamma_ptr, beta_ptr, num_instances, norm_size, epsilon,
y->mut_dptr<T>(), mean->mut_dptr<ComputeType>(), inv_variance->mut_dptr<ComputeType>());
};
};
@@ -332,6 +523,9 @@ class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaG
REGISTER_LAYER_NORM_CUDA_KERNEL(float)
REGISTER_LAYER_NORM_CUDA_KERNEL(double)
REGISTER_LAYER_NORM_CUDA_KERNEL(half)
#if CUDA_VERSION >= 11000
REGISTER_LAYER_NORM_CUDA_KERNEL(nv_bfloat16)
#endif
template<typename T>
class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
@@ -348,8 +542,8 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::C
const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
const int64_t num_instances = mean->shape_view().elem_cnt();
const int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
int64_t num_instances = mean->shape_view().elem_cnt();
int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
const T* gamma_ptr = nullptr;
if (ctx->has_input("gamma", 0)) {
gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr<T>();
@@ -361,8 +555,11 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::C
CHECK_EQ(add_to_output->shape_view(), dx->shape_view());
add_to_output_ptr = add_to_output->dptr<T>();
}
LaunchLayerNormBackward<T>(ctx->stream(), num_instances, norm_size, dy->dptr<T>(), x->dptr<T>(),
mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr<T>());
// LaunchLayerNormBackward<T>(ctx->stream(), num_instances, norm_size, dy->dptr<T>(), x->dptr<T>(),
// mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr<T>());
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
LayerNormBackwardKernelImplInternal<T>(ctx->stream(), dy->dptr<T>(), x->dptr<T>(), mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
gamma_ptr, num_instances, norm_size, dx->mut_dptr<T>(), add_to_output_ptr);
};
};
@@ -383,6 +580,9 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::C
REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(float)
REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(double)
REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(half)
#if CUDA_VERSION >= 11000
REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(nv_bfloat16)
#endif
template<typename T>
class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
@@ -399,45 +599,55 @@ class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
const int64_t num_instances = mean->shape_view().elem_cnt();
const int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
int64_t num_instances = mean->shape_view().elem_cnt();
int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
const DataType data_type = dy->data_type();
const int grid_dim_x = (norm_size + tile_size - 1) / tile_size;
const int grid_dim_y = GetGirdDimY<T>(num_instances, norm_size);
const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T);
T* tmp_gamma_diff_ptr = reinterpret_cast<T*>(tmp_buffer->mut_dptr());
T* tmp_beta_diff_ptr = reinterpret_cast<T*>(tmp_buffer->mut_dptr<char>() + tmp_gamma_diff_size);
T* reduce_buf_ptr =
reinterpret_cast<T*>(tmp_buffer->mut_dptr<char>() + 2 * tmp_gamma_diff_size);
// const DataType data_type = dy->data_type();
// const int grid_dim_x = (norm_size + tile_size - 1) / tile_size;
// const int grid_dim_y = GetGirdDimY<T>(num_instances, norm_size);
// const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T);
// T* tmp_gamma_diff_ptr = reinterpret_cast<T*>(tmp_buffer->mut_dptr());
// T* tmp_beta_diff_ptr = reinterpret_cast<T*>(tmp_buffer->mut_dptr<char>() + tmp_gamma_diff_size);
// T* reduce_buf_ptr =
// reinterpret_cast<T*>(tmp_buffer->mut_dptr<char>() + 2 * tmp_gamma_diff_size);
using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
LayerNormParamGrad<T, ComputeType><<<dim3(grid_dim_x, grid_dim_y), dim3(32, 32 / num_per_block),
0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
num_instances, norm_size, dy->dptr<T>(), x->dptr<T>(), mean->dptr<ComputeType>(),
inv_variance->dptr<ComputeType>(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr);
const int32_t m = norm_size;
const int32_t n = 1;
const int32_t k = grid_dim_y;
std::unique_ptr<ep::primitive::Fill> fill =
ep::primitive::NewPrimitive<ep::primitive::FillFactory>(ctx->stream()->device_type(),
data_type);
CHECK(fill);
fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y);
std::unique_ptr<ep::primitive::Matmul> matmul =
ep::primitive::NewPrimitive<ep::primitive::MatmulFactory>(
ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T,
ep::primitive::BlasTransposeType::N);
CHECK(matmul);
// LayerNormParamGrad<T, ComputeType><<<dim3(grid_dim_x, grid_dim_y), dim3(32, 32 / num_per_block),
// 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
// num_instances, norm_size, dy->dptr<T>(), x->dptr<T>(), mean->dptr<ComputeType>(),
// inv_variance->dptr<ComputeType>(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr);
// const int32_t m = norm_size;
// const int32_t n = 1;
// const int32_t k = grid_dim_y;
// std::unique_ptr<ep::primitive::Fill> fill =
// ep::primitive::NewPrimitive<ep::primitive::FillFactory>(ctx->stream()->device_type(),
// data_type);
// CHECK(fill);
// fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y);
// std::unique_ptr<ep::primitive::Matmul> matmul =
// ep::primitive::NewPrimitive<ep::primitive::MatmulFactory>(
// ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T,
// ep::primitive::BlasTransposeType::N);
// CHECK(matmul);
// if (ctx->has_output("gamma_diff", 0)) {
// user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0);
// matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0,
// gamma_diff->mut_dptr());
// }
// if (ctx->has_output("beta_diff", 0)) {
// user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0);
// matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0,
// beta_diff->mut_dptr());
// }
T* gamma_diff_ptr = nullptr;
T* beta_diff_ptr = nullptr;
if (ctx->has_output("gamma_diff", 0)) {
user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0);
matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0,
gamma_diff->mut_dptr());
gamma_diff_ptr = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0)->mut_dptr<T>();
}
if (ctx->has_output("beta_diff", 0)) {
user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0);
matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0,
beta_diff->mut_dptr());
beta_diff_ptr = ctx->Tensor4ArgNameAndIndex("beta_diff", 0)->mut_dptr<T>();
}
LayerNormBackwardKernelImplInternalParam<T>(ctx->stream(), dy->dptr<T>(), x->dptr<T>(), mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
num_instances, norm_size, gamma_diff_ptr, beta_diff_ptr);
};
};
@@ -453,7 +663,7 @@ class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
const auto& dy = ctx->InputTensorDesc("dy", 0); \
const int64_t num_instances = dy.shape().Count(0, begin_params_axis); \
const int64_t norm_size = dy.shape().Count(begin_params_axis); \
const int grid_dim_y = GetGirdDimY<dtype>(num_instances, norm_size); \
const int grid_dim_y = num_instances; \
size_t tmp_buffer_size = (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype); \
return tmp_buffer_size; \
});
@@ -461,5 +671,8 @@ class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float)
REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double)
REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(half)
#if CUDA_VERSION >= 11000
REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(nv_bfloat16)
#endif
} // namespace oneflow
\ No newline at end of file
}
\ No newline at end of file