Commit 7d1a83a9 authored by aiss

push Deepspeed 0.6.3 rocm version

parent ab5534fc
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// FP32 GEMM via rocblas_gemm_ex (C is passed as both the C and D matrices).
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
                   rocblas_gemm_algo algo)
{
    rocblas_status status = rocblas_gemm_ex(handle,
                                            transa,
                                            transb,
                                            m,
                                            n,
                                            k,
                                            (const void*)alpha,
                                            (const void*)A,
                                            rocblas_datatype_f32_r,
                                            (transa == rocblas_operation_none) ? m : k,
                                            (const void*)B,
                                            rocblas_datatype_f32_r,
                                            (transb == rocblas_operation_none) ? k : n,
                                            (const void*)beta,
                                            (const void*)C,
                                            rocblas_datatype_f32_r,
                                            m,
                                            (void*)C,
                                            rocblas_datatype_f32_r,
                                            m,
                                            rocblas_datatype_f32_r,
                                            algo,
                                            0,
                                            0);
    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
// FP16 GEMM via rocblas_gemm_ex with FP32 accumulation (compute type f32_r).
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
                   rocblas_gemm_algo algo)
{
    rocblas_status status = rocblas_gemm_ex(handle,
                                            transa,
                                            transb,
                                            m,
                                            n,
                                            k,
                                            (const void*)alpha,
                                            (const void*)A,
                                            rocblas_datatype_f16_r,
                                            (transa == rocblas_operation_none) ? m : k,
                                            (const void*)B,
                                            rocblas_datatype_f16_r,
                                            (transb == rocblas_operation_none) ? k : n,
                                            (const void*)beta,
                                            (const void*)C,
                                            rocblas_datatype_f16_r,
                                            m,
                                            (void*)C,
                                            rocblas_datatype_f16_r,
                                            m,
                                            rocblas_datatype_f32_r,
                                            algo,
                                            0,
                                            0);
    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
// FP32 strided-batched GEMM via rocblas_gemm_strided_batched_ex.
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                rocblas_gemm_algo algo)
{
    rocblas_status status =
        rocblas_gemm_strided_batched_ex(handle,
                                        op_A,
                                        op_B,
                                        m,
                                        n,
                                        k,
                                        alpha,
                                        A,
                                        rocblas_datatype_f32_r,
                                        (op_A == rocblas_operation_none) ? m : k,
                                        stride_A,
                                        B,
                                        rocblas_datatype_f32_r,
                                        (op_B == rocblas_operation_none) ? k : n,
                                        stride_B,
                                        beta,
                                        C,
                                        rocblas_datatype_f32_r,
                                        m,
                                        stride_C,
                                        C,
                                        rocblas_datatype_f32_r,
                                        m,
                                        stride_C,
                                        batch,
                                        rocblas_datatype_f32_r,
                                        algo,
                                        0,
                                        0);
    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
                batch,
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
// FP16 strided-batched GEMM via rocblas_gemm_strided_batched_ex with FP32 accumulation.
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                rocblas_gemm_algo algo)
{
    rocblas_status status =
        rocblas_gemm_strided_batched_ex(handle,
                                        op_A,
                                        op_B,
                                        m,
                                        n,
                                        k,
                                        alpha,
                                        A,
                                        rocblas_datatype_f16_r,
                                        (op_A == rocblas_operation_none) ? m : k,
                                        stride_A,
                                        B,
                                        rocblas_datatype_f16_r,
                                        (op_B == rocblas_operation_none) ? k : n,
                                        stride_B,
                                        beta,
                                        C,
                                        rocblas_datatype_f16_r,
                                        m,
                                        stride_C,
                                        C,
                                        rocblas_datatype_f16_r,
                                        m,
                                        stride_C,
                                        batch,
                                        rocblas_datatype_f32_r,
                                        algo,
                                        0,
                                        0);
    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
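/*
 * Illustrative usage sketch (not part of the original file): calling the FP32 wrapper
 * above from host code. It assumes `handle` is a valid rocblas_handle and d_A, d_B,
 * d_C are device buffers shaped for a column-major (m x k) * (k x n) product.
 *
 *   float alpha = 1.0f, beta = 0.0f;
 *   cublas_gemm_ex(handle,
 *                  rocblas_operation_none,
 *                  rocblas_operation_none,
 *                  m,
 *                  n,
 *                  k,
 *                  &alpha,
 *                  &beta,
 *                  d_A,
 *                  d_B,
 *                  d_C,
 *                  rocblas_gemm_algo_standard);
 */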
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
template <typename T>
void launch_bias_residual(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int hidden_dim,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
cudaStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int head_size,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
T* key_layer,
unsigned head_size,
unsigned seq_len,
unsigned rotary_dim,
unsigned offset,
unsigned num_heads,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream);
template <typename T>
void launch_moe_res_matmul(T* residual,
T* coef,
T* mlp_out,
int seq_len,
int hidden_dim,
cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!!
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
hipStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream);
template <typename T>
void launch_bias_residual(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int hidden_dim,
int mp_size,
hipStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
hipStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
hipStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int head_size,
int mp_size,
hipStream_t stream);
template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
T* key_layer,
unsigned head_size,
unsigned seq_len,
unsigned rotary_dim,
unsigned offset,
unsigned num_heads,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
hipStream_t stream);
template <typename T>
void launch_moe_res_matmul(T* residual,
T* coef,
T* mlp_out,
int seq_len,
int hidden_dim,
hipStream_t stream);
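/*
 * Illustrative usage sketch (not part of the original header): invoking one of the
 * declared launchers from host code. It assumes `out`, `vals`, `gamma`, and `beta`
 * are __half device pointers sized for batch_size rows of hidden_dim elements, and
 * that `stream` is a valid hipStream_t.
 *
 *   launch_layer_norm<__half>(out, vals, gamma, beta, 1e-5f, batch_size, hidden_dim, stream);
 */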
#include "custom_cuda_layers.h"
namespace cg = cooperative_groups;
/*
Fused bias add, residual (elementwise) add, and normalization layer.
For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput of
__half2 instructions and to avoid the conversion overhead (1/8 of __half2 arithmetic).
For specific launch constraints, see the launch functions.
*/
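/*
 * For reference (illustrative note, not part of the original file): per row, the kernels
 * below compute the standard layer norm
 *
 *   mu      = (1/H) * sum_i x_i
 *   sigma^2 = (1/H) * sum_i (x_i - mu)^2
 *   y_i     = gamma_i * (x_i - mu) * rsqrt(sigma^2 + epsilon) + beta_i
 *
 * where x is the residual input and H is the hidden dimension (row_stride for the float
 * kernels, 2 * row_stride for the vectorized __half2 kernels).
 */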
#define NORM_REG (MAX_REGISTERS / 4)
__global__ void fused_bias_residual_layer_norm(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
bool preLayerNorm,
bool training,
float* vars,
float* means,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id / WARP_SIZE;
float vals_arr[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
residual += (row * row_stride);
vals += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_arr[i] = residual[i * iteration_stride + id];
sum += vals_arr[i];
}
if (high_index < row_stride) {
vals_arr[iterations] = residual[high_index];
sum += vals_arr[iterations];
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / row_stride;
if (training)
if (threadIdx.x == 0) means[row] = mean;
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_arr[i] -= mean;
variance += vals_arr[i] * vals_arr[i];
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= row_stride;
variance += epsilon;
if (training)
if (threadIdx.x == 0) vars[row] = variance;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] = vals_arr[i] * rsqrtf(variance);
vals_arr[i] =
vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id];
vals[i * iteration_stride + id] = vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance);
vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index];
vals[high_index] = vals_arr[iterations];
}
}
__global__ void fused_bias_residual_layer_norm(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
bool preLayerNorm,
bool training,
__half* vars,
__half* means,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> WARP_SIZE_BITS;
float2 vals_f[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
__half2* vals_cast = reinterpret_cast<__half2*>(vals);
const __half2* residual_cast = reinterpret_cast<const __half2*>(residual);
residual_cast += (row * row_stride);
vals_cast += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]);
sum += vals_f[i].x;
sum += vals_f[i].y;
}
if ((high_index) < row_stride) {
vals_f[iterations] = __half22float2(residual_cast[high_index]);
sum += vals_f[iterations].x;
sum += vals_f[iterations].y;
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / (row_stride * 2);
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_f[i].x -= mean;
vals_f[i].y -= mean;
variance += vals_f[i].x * vals_f[i].x;
variance += vals_f[i].y * vals_f[i].y;
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= (row_stride * 2);
variance += epsilon;
__half2 variance_h = __float2half2_rn(variance);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
if (training && threadIdx.x == 0) {
vars[row] = __float2half(variance);
means[row] = __float2half(mean);
}
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
__half2 vals_arr = __float22half2_rn(vals_f[i]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr =
vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id];
vals_cast[i * iteration_stride + id] = vals_arr;
}
if ((high_index) < row_stride) {
__half2 vals_arr = __float22half2_rn(vals_f[iterations]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index];
vals_cast[high_index] = vals_arr;
}
#endif
}
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
template <>
void launch_bias_residual_layer_norm<float>(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
float* vars,
float* means)
{
int threads = THREADS;
dim3 grid_dim(batch_size);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim);
}
template <>
void launch_bias_residual_layer_norm<__half>(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
__half* vars,
__half* means)
{
int threads = 128;
dim3 grid_dim(batch_size);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2);
}
__global__ void fused_bias_residual_layer_norm(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
bool preLayerNorm,
bool training,
float* vars,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id / 32;
float vals_arr[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
residual += (row * row_stride);
vals += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_arr[i] = residual[i * iteration_stride + id];
sum += vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = residual[high_index];
sum += vals_arr[iterations];
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / row_stride;
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_arr[i] -= mean;
variance += vals_arr[i] * vals_arr[i];
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= row_stride;
variance += epsilon;
if (training)
if (threadIdx.x == 0) vars[row] = variance;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] = vals_arr[i] * rsqrtf(variance);
vals_arr[i] =
vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id];
vals[i * iteration_stride + id] = vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance);
vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index];
vals[high_index] = vals_arr[iterations];
}
}
__global__ void fused_bias_residual_layer_norm(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
bool preLayerNorm,
bool training,
__half* vars,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> WARP_SIZE_BITS;
float2 vals_f[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
__half2* vals_cast = reinterpret_cast<__half2*>(vals);
const __half2* residual_cast = reinterpret_cast<const __half2*>(residual);
residual_cast += (row * row_stride);
vals_cast += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]);
sum += vals_f[i].x;
sum += vals_f[i].y;
}
if ((high_index) < row_stride) {
vals_f[iterations] = __half22float2(residual_cast[high_index]);
sum += vals_f[iterations].x;
sum += vals_f[iterations].y;
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / (row_stride * 2);
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_f[i].x -= mean;
vals_f[i].y -= mean;
variance += vals_f[i].x * vals_f[i].x;
variance += vals_f[i].y * vals_f[i].y;
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= (row_stride * 2);
variance += epsilon;
__half2 variance_h = __float2half2_rn(variance);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
if (training && threadIdx.x == 0) vars[row] = __float2half(variance);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
__half2 vals_arr = __float22half2_rn(vals_f[i]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr =
vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id];
vals_cast[i * iteration_stride + id] = vals_arr;
}
if ((high_index) < row_stride) {
__half2 vals_arr = __float22half2_rn(vals_f[iterations]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index];
vals_cast[high_index] = vals_arr;
}
#endif
}
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
/*
To tune this launch, the following restrictions must be met:
For float:
row_stride == hidden_size
threads * iterations == row_stride
threads is in [32, 64, 128, 256, 512, 1024]
For half:
row_stride == hidden_size / 2
threads * iterations == row_stride
threads is in [32, 64, 128, 256, 512, 1024]
*/
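/*
 * Illustrative example of the constraint above (assuming THREADS is 256, as defined in the
 * shared header): for hidden_size = 8192 in FP32, row_stride = 8192 and threads = 256, so
 * each thread keeps iterations = 8192 / 256 = 32 values in registers, which fits within
 * NORM_REG = MAX_REGISTERS / 4 = 64.
 */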
template <>
void launch_bias_residual_layer_norm<float>(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
float* vars)
{
int threads = THREADS;
dim3 grid_dim(batch_size);
// There are some limitations on calling the functions below, so the supported hidden sizes are enumerated here.
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim);
}
template <>
void launch_bias_residual_layer_norm<__half>(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
__half* vars)
{
int threads = 128;
dim3 grid_dim(batch_size);
// There are some limitations on calling the functions below, so the supported hidden sizes are enumerated here.
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2);
}
/* Normalize Gamma & Betta gradients
* Compute gradients using either X_hat or
* the normalized input (invertible).
* Combines the transpose with the gradient computation.
*/
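/*
 * Illustrative note (not part of the original file): per column j, the kernel below
 * accumulates the usual layer-norm parameter gradients
 *
 *   dbetta_j = sum_over_rows out_grad[r][j]
 *   dgamma_j = sum_over_rows out_grad[r][j] * x_hat[r][j]
 *
 * where x_hat is either recovered from the output ((vals_hat - betta) / gamma, the
 * "invertible" path) or taken directly as vals_hat.
 */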
template <typename T>
__global__ void LayerNormBackward1(const T* __restrict__ out_grad,
const T* __restrict__ vals_hat,
const T* __restrict__ gamma,
const T* __restrict__ betta,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width,
bool invertible)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
float gamma_reg = (float)gamma[idx];
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad[offset];
float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg
: (float)vals_hat[offset]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
/* Normalize Gamma & Betta gradients
* Compute gradients using the input to
* the normalization.
* Combine transpose with gradients computation.
*/
template <typename T>
__global__ void LayerNormBackward1(const T* __restrict__ out_grad,
const T* __restrict__ X_data,
const T* __restrict__ vars,
const T* __restrict__ means,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad[offset];
float val = (float)X_data[offset];
val = (val - (float)means[r]) * rsqrtf((float)vars[r]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
/* Backward Normalize (Input-Gradient)
* Using the means and variances from the input
* This type of backward is invertible!
* We do the backward using X_hat = (X - u) / sqrt(variance), i.e. the output of the normalization.
*/
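/*
 * Illustrative note (not part of the original file): with g = out_grad * gamma and x_hat
 * the normalized activation, the kernels below compute the usual layer-norm input gradient
 *
 *   inp_grad = rsqrt(var) * (g - mean(g) - x_hat * mean(g * x_hat))
 *
 * where var is the stored per-row variance (epsilon was already folded in by the forward
 * pass) and mean() averages over the hidden dimension of one row.
 */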
__global__ void LayerNormBackward2(const float* out_grad,
const float* vals_hat,
const float* gamma,
const float* betta,
const float* vars,
float* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad += (row * row_stride);
vals_hat += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) /
gamma_reg
: vals_hat[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg
: vals_hat[high_index]);
iterations++;
}
float var_reg = vars[row];
float sum = 0;
for (int i = 0; i < iterations; i++) {
sum += vals_hat_arr[i] * vals_arr[i] *
sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad
vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var)
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); }
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum);
if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum);
}
__global__ void LayerNormBackward2(const __half* out_grad,
const __half* vals_hat,
const __half* gamma,
const __half* betta,
const __half* vars,
__half* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h = reinterpret_cast<const __half2*>(out_grad);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(vals_hat);
inp_grad_h += (row * row_stride);
out_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
const __half2* betta_h = (invertible ? reinterpret_cast<const __half2*>(betta) : nullptr);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible
? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) /
gamma_reg
: vals_hat_h[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg
: vals_hat_h[high_index]);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg));
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 temp_f = __half22float2(temp);
vals_arr_f[i].x += temp_f.x;
vals_arr_f[i].y += temp_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp;
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp;
}
}
template <>
void launch_layerNorm_backward<float>(const float* out_grad,
const float* vals_hat,
const float* vars,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2],
bool invertible,
const float* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<float><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
LayerNormBackward2<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim);
}
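/*
 * Note (illustrative, not part of the original file): the backward launchers split the work
 * across the two streams they receive. LayerNormBackward1 (gamma/betta gradients) is issued
 * on stream[0] while LayerNormBackward2 (input gradient) is issued on stream[1], so the two
 * reductions can overlap on the device.
 */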
template <>
void launch_layerNorm_backward<__half>(const __half* out_grad,
const __half* vals_hat,
const __half* vars,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2],
bool invertible,
const __half* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
// LayerNormBackward1<__half><<<grid_dim, block_dim, 0, stream[0]>>>(
// out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
LayerNormBackward2<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2);
}
/* Backward Normalize (Input-Gradient)
* Using the means and variances from the input
* This type of backward is not invertible!
* We do the backward using the input (X)
*/
__global__ void LayerNormBackward2(const float* out_grad,
const float* X_vals,
const float* gamma,
const float* vars,
const float* means,
float* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad += (row * row_stride);
X_vals += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad[high_index];
vals_arr[iterations] *= gamma_reg;
iterations++;
}
float var_reg = vars[row];
float mean_reg = means[row];
float sum = 0;
float xu[NORM_REG];
for (int i = 0; i < iterations; i++) {
xu[i] = (X_vals[i * iteration_stride + id] - mean_reg);
sum += vals_arr[i] * xu[i];
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg));
}
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum);
if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum);
}
__global__ void LayerNormBackward2(const __half* out_grad,
const __half* X_vals,
const __half* gamma,
const __half* vars,
const __half* means,
__half* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 xu[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h = reinterpret_cast<const __half2*>(out_grad);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(X_vals);
inp_grad_h += (row * row_stride);
out_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
int high_index = iterations * iteration_stride + id;
__half mean_h = means[row];
__half2 mean_reg = __halves2half2(mean_h, mean_h);
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
xu[iterations] = (vals_hat_h[high_index] - mean_reg);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (xu[i] * vals_arr[i]);
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 xu_grad_f = __half22float2(xu_grad);
vals_arr_f[i].x += xu_grad_f.x;
vals_arr_f[i].y += xu_grad_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp;
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp;
}
}
template <>
void launch_layerNorm_backward<float>(const float* out_grad,
const float* X_data,
const float* vars,
const float* means,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<float><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
LayerNormBackward2<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim);
}
template <>
void launch_layerNorm_backward<__half>(const __half* out_grad,
const __half* X_data,
const __half* vars,
const __half* means,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<__half><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
LayerNormBackward2<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2);
}
template <typename T>
__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
const T* __restrict__ out_grad2,
const T* __restrict__ vals_hat,
const T* __restrict__ gamma,
const T* __restrict__ betta,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width,
bool invertible)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
float gamma_reg = (float)gamma[idx];
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad1[offset] + (float)out_grad2[offset];
float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg
: (float)vals_hat[offset]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
template <typename T>
__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
const T* __restrict__ out_grad2,
const T* __restrict__ X_data,
const T* __restrict__ vars,
const T* __restrict__ means,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad1[offset] + (float)out_grad2[offset];
float val = (float)X_data[offset];
val = (val - (float)means[r]) * rsqrtf((float)vars[r]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
__global__ void LayerNormBackward2_fused_add(const float* out_grad1,
const float* out_grad2,
const float* vals_hat,
const float* gamma,
const float* betta,
const float* vars,
float* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad1 += (row * row_stride);
out_grad2 += (row * row_stride);
vals_hat += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) /
gamma_reg
: vals_hat[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad1[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg
: vals_hat[high_index]);
iterations++;
}
float var_reg = vars[row];
float sum = 0;
for (int i = 0; i < iterations; i++) {
sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg);
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); }
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++)
inp_grad[i * iteration_stride + id] =
(vals_arr[i] - sum) + out_grad2[i * iteration_stride + id];
if ((high_index) < row_stride)
inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index];
}
__global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
const __half* out_grad2,
const __half* vals_hat,
const __half* gamma,
const __half* betta,
const __half* vars,
__half* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
// float2 result[iterations];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h1 = reinterpret_cast<const __half2*>(out_grad1);
const __half2* out_grad_h2 = reinterpret_cast<const __half2*>(out_grad2);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(vals_hat);
inp_grad_h += (row * row_stride);
out_grad_h1 += (row * row_stride);
out_grad_h2 += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
const __half2* betta_h = (invertible ? reinterpret_cast<const __half2*>(betta) : nullptr);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
vals_hat_arr[i] =
(invertible
? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) /
gamma_reg
: vals_hat_h[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h1[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
vals_hat_arr[iterations] =
(invertible ? (vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg
: vals_hat_h[high_index]);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg));
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 temp_f = __half22float2(temp);
vals_arr_f[i].x += temp_f.x;
vals_arr_f[i].y += temp_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
}
template <>
void launch_layerNorm_backward_fused_add<float>(const float* out_grad1,
const float* out_grad2,
const float* vals_hat,
const float* vars,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2],
bool invertible,
const float* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<float><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
LayerNormBackward2_fused_add<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim);
}
template <>
void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1,
const __half* out_grad2,
const __half* vals_hat,
const __half* vars,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2],
bool invertible,
const __half* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<__half><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
LayerNormBackward2_fused_add<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2);
}
/* Backward Normalize (Input-Gradient)
* Using the means and variances from the input
* This type of backward is not invertible!
* We do the backward using the input (X)
*/
__global__ void LayerNormBackward2_fused_add(const float* out_grad1,
const float* out_grad2,
const float* X_vals,
const float* gamma,
const float* vars,
const float* means,
float* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
out_grad1 += (row * row_stride);
out_grad2 += (row * row_stride);
X_vals += (row * row_stride);
inp_grad += (row * row_stride);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] = X_vals[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad1[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] = X_vals[high_index];
iterations++;
}
float var_reg = vars[row];
float mean_reg = means[row];
float sum = 0;
float xu[NORM_REG];
for (int i = 0; i < iterations; i++) {
xu[i] = (vals_hat_arr[i] - mean_reg);
sum += vals_arr[i] * xu[i];
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg));
}
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++)
inp_grad[i * iteration_stride + id] =
(vals_arr[i] - sum) + out_grad2[i * iteration_stride + id];
if ((high_index) < row_stride)
inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index];
}
__global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
const __half* out_grad2,
const __half* X_vals,
const __half* gamma,
const __half* vars,
const __half* means,
__half* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h1 = reinterpret_cast<const __half2*>(out_grad1);
const __half2* out_grad_h2 = reinterpret_cast<const __half2*>(out_grad2);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(X_vals);
out_grad_h1 += (row * row_stride);
out_grad_h2 += (row * row_stride);
inp_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h1[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
vals_hat_arr[iterations] = vals_hat_h[high_index];
iterations++;
}
__half mean_h = means[row];
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
__half2 mean_reg = __halves2half2(mean_h, mean_h);
__half2 xu[NORM_REG];
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
xu[i] = (vals_hat_arr[i] - mean_reg);
__half2 result_h = (xu[i] * vals_arr[i]);
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 xu_grad_f = __half22float2(xu_grad);
vals_arr_f[i].x += xu_grad_f.x;
vals_arr_f[i].y += xu_grad_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
}
template <>
void launch_layerNorm_backward_fused_add<float>(const float* out_grad1,
const float* out_grad2,
const float* X_data,
const float* vars,
const float* means,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<float><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
LayerNormBackward2_fused_add<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim);
}
template <>
void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1,
const __half* out_grad2,
const __half* X_data,
const __half* vars,
const __half* means,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
cudaStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
LayerNormBackward1<__half><<<grid_dim, block_dim, 0, stream[0]>>>(
out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
LayerNormBackward2_fused_add<<<grid_dim2, block_dim2, 0, stream[1]>>>(
out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2);
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
namespace cg = cooperative_groups;
/*
Fused bias add, residual (elementwise) add, and normalization layer.
For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for
__half2 instructions, and to avoid the conversion overhead (roughly 1/8 of the __half2 arithmetic).
For specific launch constraints, see the launch functions.
*/
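/* Illustrative sizing (a sketch, not part of the original source; assumes THREADS == 256 as
 * defined in the shared headers): for hidden_dim = 4096,
 *   float : row_stride = 4096, one block of 256 threads -> 16 elements per thread;
 *   __half: the data is reinterpreted as __half2, so row_stride = 4096 / 2 = 2048 and one block
 *           of 128 threads -> 16 __half2 values (32 halves) per thread.
 * Each thread keeps its slice in the NORM_REG-sized register arrays declared in the kernels.
 */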
#define NORM_REG (MAX_REGISTERS / 4)
__global__ void fused_bias_residual_layer_norm(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
bool preLayerNorm,
bool training,
float* vars,
float* means,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id / WARP_SIZE;
float vals_arr[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
residual += (row * row_stride);
vals += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_arr[i] = residual[i * iteration_stride + id];
sum += vals_arr[i];
}
if (high_index < row_stride) {
vals_arr[iterations] = residual[high_index];
sum += vals_arr[iterations];
iterations++;
}
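// Two-level reduction (explanatory comment, not in the original source): first reduce within each
// warp with shfl_down, park one partial per warp in the shared buffer, then let the first warp
// combine the per-warp partials and broadcast the row sum to every thread with shfl.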
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / row_stride;
if (training)
if (threadIdx.x == 0) means[row] = mean;
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_arr[i] -= mean;
variance += vals_arr[i] * vals_arr[i];
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= row_stride;
variance += epsilon;
if (training)
if (threadIdx.x == 0) vars[row] = variance;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] = vals_arr[i] * rsqrtf(variance);
vals_arr[i] =
vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id];
vals[i * iteration_stride + id] = vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance);
vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index];
vals[high_index] = vals_arr[iterations];
}
}
__global__ void fused_bias_residual_layer_norm(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
bool preLayerNorm,
bool training,
__half* vars,
__half* means,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> WARP_SIZE_BITS;
float2 vals_f[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
__half2* vals_cast = reinterpret_cast<__half2*>(vals);
const __half2* residual_cast = reinterpret_cast<const __half2*>(residual);
residual_cast += (row * row_stride);
vals_cast += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]);
sum += vals_f[i].x;
sum += vals_f[i].y;
}
if ((high_index) < row_stride) {
vals_f[iterations] = __half22float2(residual_cast[high_index]);
sum += vals_f[iterations].x;
sum += vals_f[iterations].y;
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / (row_stride * 2);
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_f[i].x -= mean;
vals_f[i].y -= mean;
variance += vals_f[i].x * vals_f[i].x;
variance += vals_f[i].y * vals_f[i].y;
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= (row_stride * 2);
variance += epsilon;
__half2 variance_h = __float2half2_rn(variance);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
if (training && threadIdx.x == 0) {
vars[row] = __float2half(variance);
means[row] = __float2half(mean);
}
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
__half2 vals_arr = __float22half2_rn(vals_f[i]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr =
vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id];
vals_cast[i * iteration_stride + id] = vals_arr;
}
if ((high_index) < row_stride) {
__half2 vals_arr = __float22half2_rn(vals_f[iterations]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index];
vals_cast[high_index] = vals_arr;
}
#endif
}
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
template <>
void launch_bias_residual_layer_norm<float>(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
float* vars,
float* means)
{
int threads = THREADS;
dim3 grid_dim(batch_size);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim);
}
template <>
void launch_bias_residual_layer_norm<__half>(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
__half* vars,
__half* means)
{
int threads = 128;
dim3 grid_dim(batch_size);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2);
}
__global__ void fused_bias_residual_layer_norm(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
bool preLayerNorm,
bool training,
float* vars,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id / 32;
float vals_arr[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
residual += (row * row_stride);
vals += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_arr[i] = residual[i * iteration_stride + id];
sum += vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = residual[high_index];
sum += vals_arr[iterations];
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#if !defined(__STOCHASTIC_MODE__) || __CUDA_ARCH__ < 700
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / row_stride;
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_arr[i] -= mean;
variance += vals_arr[i] * vals_arr[i];
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= row_stride;
variance += epsilon;
if (training)
if (threadIdx.x == 0) vars[row] = variance;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] = vals_arr[i] * rsqrtf(variance);
vals_arr[i] =
vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id];
vals[i * iteration_stride + id] = vals_arr[i];
}
if ((high_index) < row_stride) {
vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance);
vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index];
vals[high_index] = vals_arr[iterations];
}
}
__global__ void fused_bias_residual_layer_norm(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
bool preLayerNorm,
bool training,
__half* vars,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> WARP_SIZE_BITS;
float2 vals_f[NORM_REG];
__shared__ float shr[MAX_WARP_NUM];
__half2* vals_cast = reinterpret_cast<__half2*>(vals);
const __half2* residual_cast = reinterpret_cast<const __half2*>(residual);
residual_cast += (row * row_stride);
vals_cast += (row * row_stride);
float sum = 0.f;
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]);
sum += vals_f[i].x;
sum += vals_f[i].y;
}
if ((high_index) < row_stride) {
vals_f[iterations] = __half22float2(residual_cast[high_index]);
sum += vals_f[iterations].x;
sum += vals_f[iterations].y;
iterations++;
}
for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) sum = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
sum += g.shfl_down(sum, i);
}
sum = g.shfl(sum, 0);
float mean = sum / (row_stride * 2);
float variance = 0.f;
for (int i = 0; i < iterations; i++) {
vals_f[i].x -= mean;
vals_f[i].y -= mean;
variance += vals_f[i].x * vals_f[i].x;
variance += vals_f[i].y * vals_f[i].y;
}
for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); }
if (g.thread_rank() == 0) shr[gid] = variance;
b.sync();
if (g.thread_rank() < (iteration_stride >> WARP_SIZE_BITS)) variance = shr[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
for (int i = 1; i < (iteration_stride >> WARP_SIZE_BITS); i *= 2) {
variance += g.shfl_down(variance, i);
}
variance = g.shfl(variance, 0);
variance /= (row_stride * 2);
variance += epsilon;
__half2 variance_h = __float2half2_rn(variance);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
if (training && threadIdx.x == 0) vars[row] = __float2half(variance);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
__half2 vals_arr = __float22half2_rn(vals_f[i]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr =
vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id];
vals_cast[i * iteration_stride + id] = vals_arr;
}
if ((high_index) < row_stride) {
__half2 vals_arr = __float22half2_rn(vals_f[iterations]);
vals_arr = vals_arr * h2rsqrt(variance_h);
vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index];
vals_cast[high_index] = vals_arr;
}
#endif
}
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
/*
To tune this launch the following restrictions must be met:
For float:
row_stride == hidden_size
threads * iterations == row_stride
threads is in [32, 64, 128, 256, 512, 1024]
For half:
row_stride == hidden_size / 2
threads * iterations == row_stride
threads is in [32, 64, 128, 256, 512, 1024]
*/
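/* Worked example of the ladder below (a sketch, not part of the original source; assumes
 * THREADS == 256):
 *   float, hidden_dim =  8192: threads stay at 256, so each thread handles 8192 / 256 = 32 values;
 *   float, hidden_dim = 20480: 16384 < 20480 <= 32768, so threads <<= 1 -> 512 and each thread
 *                              handles 40 values;
 *   half,  hidden_dim = 20480: threads start at 128, the same range gives threads <<= 2 -> 512,
 *                              and row_stride = 20480 / 2 = 10240 __half2 values, i.e. 20 __half2
 *                              (40 halves) per thread.
 */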
template <>
void launch_bias_residual_layer_norm<float>(float* vals,
const float* residual,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
float* vars)
{
int threads = THREADS;
dim3 grid_dim(batch_size);
// The kernel has launch-size restrictions (see the comment above), so the supported hidden_dim ranges are enumerated explicitly.
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim);
}
template <>
void launch_bias_residual_layer_norm<__half>(__half* vals,
const __half* residual,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
__half* vars)
{
int threads = 128;
dim3 grid_dim(batch_size);
// The kernel has launch-size restrictions (see the comment above), so the supported hidden_dim ranges are enumerated explicitly.
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim(threads);
hipLaunchKernelGGL(( fused_bias_residual_layer_norm), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2);
}
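// Usage sketch (illustrative only; the buffer names and the epsilon value here are assumptions,
// not part of this file):
//   launch_bias_residual_layer_norm<float>(out, residual, gamma, beta, 1e-12f,
//                                          batch_size, hidden_dim, stream,
//                                          false /*preLayerNorm*/, true /*training*/, vars_buf);
// vars_buf must hold batch_size floats; when training is true the per-row variance (with epsilon
// already added) is written there and later consumed by the backward kernels below.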
/* Normalize Gamma & Betta gradients
* Compute gradients using either X_hat directly or, in the invertible case,
* X_hat reconstructed from the layer output.
* The transpose is fused with the gradient computation.
*/
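/* Reduction-pattern sketch (explanatory note, not in the original source): each block owns
 * TILE_DIM consecutive columns of the (rows x width) gradient matrix. Every thread accumulates a
 * partial column sum while striding down the rows, the partials are staged in shared memory
 * (padded to [TILE_DIM][TILE_DIM + 1] to avoid bank conflicts), read back transposed, and a
 * TILE_DIM-wide tile finishes each column with shfl_down. The thread with threadIdx.x == 0 then
 * writes betta_grad / gamma_grad for column blockIdx.x * TILE_DIM + threadIdx.y.
 */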
template <typename T>
__global__ void LayerNormBackward1(const T* __restrict__ out_grad,
const T* __restrict__ vals_hat,
const T* __restrict__ gamma,
const T* __restrict__ betta,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width,
bool invertible)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
float gamma_reg = (float)gamma[idx];
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad[offset];
float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg
: (float)vals_hat[offset]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
/* Normalize Gamma & Betta gradients
* Compute gradients using the input to the normalization
* together with its saved mean and variance.
* The transpose is fused with the gradient computation.
*/
template <typename T>
__global__ void LayerNormBackward1(const T* __restrict__ out_grad,
const T* __restrict__ X_data,
const T* __restrict__ vars,
const T* __restrict__ means,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad[offset];
float val = (float)X_data[offset];
val = (val - (float)means[r]) * rsqrtf((float)vars[r]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
/* Backward Normalize (Input-Gradient)
 * Using the means and variances from the input.
 * This type of backward is invertible!
 * We do the backward using X_hat = (X - u) / sqrt(variance), or equivalently the output of the
 * normalization when the invertible path reconstructs X_hat from it.
 */
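/* Invertible-path sketch (explanatory note, not in the original source): when `invertible` is set,
 * the kernel receives the layer output y = gamma * X_hat + betta instead of X_hat and recovers
 *   X_hat = (y - betta) / gamma
 * element-wise before applying the input-gradient formula, so the pre-normalization activations
 * do not need to be stored for the backward pass.
 */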
__global__ void LayerNormBackward2(const float* out_grad,
const float* vals_hat,
const float* gamma,
const float* betta,
const float* vars,
float* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad += (row * row_stride);
vals_hat += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) /
gamma_reg
: vals_hat[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg
: vals_hat[high_index]);
iterations++;
}
float var_reg = vars[row];
float sum = 0;
for (int i = 0; i < iterations; i++) {
sum += vals_hat_arr[i] * vals_arr[i] *
sqrtf(var_reg); // dval_hat = gamma * (x - u) * out_grad
vals_arr[i] *= rsqrtf(var_reg); // dvar_inv = gamma * out_grad / sqrt(var)
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); }
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum);
if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum);
}
__global__ void LayerNormBackward2(const __half* out_grad,
const __half* vals_hat,
const __half* gamma,
const __half* betta,
const __half* vars,
__half* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h = reinterpret_cast<const __half2*>(out_grad);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(vals_hat);
inp_grad_h += (row * row_stride);
out_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
const __half2* betta_h = (invertible ? reinterpret_cast<const __half2*>(betta) : nullptr);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible
? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) /
gamma_reg
: vals_hat_h[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg
: vals_hat_h[high_index]);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg));
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 temp_f = __half22float2(temp);
vals_arr_f[i].x += temp_f.x;
vals_arr_f[i].y += temp_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp;
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp;
}
}
template <>
void launch_layerNorm_backward<float>(const float* out_grad,
const float* vals_hat,
const float* vars,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2],
bool invertible,
const float* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<float>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim);
}
template <>
void launch_layerNorm_backward<__half>(const __half* out_grad,
const __half* vals_hat,
const __half* vars,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2],
bool invertible,
const __half* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2);
}
/* Backward Normalize (Input-Gradient)
* Using the means and variances from the input
* This type of backward is not invertible!
* We do the backward using the input (X)
*/
__global__ void LayerNormBackward2(const float* out_grad,
const float* X_vals,
const float* gamma,
const float* vars,
const float* means,
float* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad += (row * row_stride);
X_vals += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad[high_index];
vals_arr[iterations] *= gamma_reg;
iterations++;
}
float var_reg = vars[row];
float mean_reg = means[row];
float sum = 0;
float xu[NORM_REG];
for (int i = 0; i < iterations; i++) {
xu[i] = (X_vals[i * iteration_stride + id] - mean_reg);
sum += vals_arr[i] * xu[i];
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg));
}
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum);
if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum);
}
__global__ void LayerNormBackward2(const __half* out_grad,
const __half* X_vals,
const __half* gamma,
const __half* vars,
const __half* means,
__half* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 xu[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h = reinterpret_cast<const __half2*>(out_grad);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(X_vals);
inp_grad_h += (row * row_stride);
out_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
int high_index = iterations * iteration_stride + id;
__half mean_h = means[row];
__half2 mean_reg = __halves2half2(mean_h, mean_h);
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
xu[iterations] = (vals_hat_h[high_index] - mean_reg);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (xu[i] * vals_arr[i]);
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 xu_grad_f = __half22float2(xu_grad);
vals_arr_f[i].x += xu_grad_f.x;
vals_arr_f[i].y += xu_grad_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp;
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp;
}
}
template <>
void launch_layerNorm_backward<float>(const float* out_grad,
const float* X_data,
const float* vars,
const float* means,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<float>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim);
}
template <>
void launch_layerNorm_backward<__half>(const __half* out_grad,
const __half* X_data,
const __half* vars,
const __half* means,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
hipLaunchKernelGGL(( LayerNormBackward2), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2);
}
template <typename T>
__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
const T* __restrict__ out_grad2,
const T* __restrict__ vals_hat,
const T* __restrict__ gamma,
const T* __restrict__ betta,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width,
bool invertible)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
float gamma_reg = (float)gamma[idx];
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad1[offset] + (float)out_grad2[offset];
float val = (invertible ? ((float)vals_hat[offset] - betta_reg) / gamma_reg
: (float)vals_hat[offset]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
template <typename T>
__global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
const T* __restrict__ out_grad2,
const T* __restrict__ X_data,
const T* __restrict__ vars,
const T* __restrict__ means,
T* __restrict__ gamma_grad,
T* __restrict__ betta_grad,
int rows,
int width)
{
__shared__ float betta_buffer[TILE_DIM][TILE_DIM + 1];
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
int y_stride = width * TILE_DIM;
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
// Loop across matrix height
float betta_tmp = 0;
float gamma_tmp = 0;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
float grad = (float)out_grad1[offset] + (float)out_grad2[offset];
float val = (float)X_data[offset];
val = (val - (float)means[r]) * rsqrtf((float)vars[r]);
betta_tmp += grad;
gamma_tmp += (val * grad);
offset += y_stride;
}
betta_buffer[threadIdx.x][threadIdx.y] = betta_tmp;
gamma_buffer[threadIdx.x][threadIdx.y] = gamma_tmp;
__syncthreads();
// Sum the shared buffer.
float s1 = betta_buffer[threadIdx.y][threadIdx.x];
float s2 = gamma_buffer[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) {
s1 += g.shfl_down(s1, i);
s2 += g.shfl_down(s2, i);
}
if (threadIdx.x == 0) {
betta_grad[pos] = s1;
gamma_grad[pos] = s2;
}
}
__global__ void LayerNormBackward2_fused_add(const float* out_grad1,
const float* out_grad2,
const float* vals_hat,
const float* gamma,
const float* betta,
const float* vars,
float* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
out_grad1 += (row * row_stride);
out_grad2 += (row * row_stride);
vals_hat += (row * row_stride);
inp_grad += (row * row_stride);
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] =
(invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) /
gamma_reg
: vals_hat[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad1[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] =
(invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg
: vals_hat[high_index]);
iterations++;
}
float var_reg = vars[row];
float sum = 0;
for (int i = 0; i < iterations; i++) {
sum += vals_hat_arr[i] * vals_arr[i] * sqrtf(var_reg);
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) { vals_arr[i] += ((-sum * vals_hat_arr[i]) / var_reg); }
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++)
inp_grad[i * iteration_stride + id] =
(vals_arr[i] - sum) + out_grad2[i * iteration_stride + id];
if ((high_index) < row_stride)
inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index];
}
__global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
const __half* out_grad2,
const __half* vals_hat,
const __half* gamma,
const __half* betta,
const __half* vars,
__half* inp_grad,
bool invertible,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
// float2 result[iterations];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h1 = reinterpret_cast<const __half2*>(out_grad1);
const __half2* out_grad_h2 = reinterpret_cast<const __half2*>(out_grad2);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(vals_hat);
inp_grad_h += (row * row_stride);
out_grad_h1 += (row * row_stride);
out_grad_h2 += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
const __half2* betta_h = (invertible ? reinterpret_cast<const __half2*>(betta) : nullptr);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
vals_hat_arr[i] =
(invertible
? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) /
gamma_reg
: vals_hat_h[i * iteration_stride + id]);
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h1[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
vals_hat_arr[iterations] =
(invertible ? (vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg
: vals_hat_h[high_index]);
iterations++;
}
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
__half2 result_h = (vals_hat_arr[i] * vals_arr[i] * h2sqrt(var_reg));
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 temp = ((-sum_h * vals_hat_arr[i]) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 temp_f = __half22float2(temp);
vals_arr_f[i].x += temp_f.x;
vals_arr_f[i].y += temp_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
}
template <>
void launch_layerNorm_backward_fused_add<float>(const float* out_grad1,
const float* out_grad2,
const float* vals_hat,
const float* vars,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2],
bool invertible,
const float* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<float>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim);
}
template <>
void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1,
const __half* out_grad2,
const __half* vals_hat,
const __half* vars,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2],
bool invertible,
const __half* betta)
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2);
}
/* Backward Normalize (Input-Gradient)
* Using the means and variances from the input
* This type of backward is not invertible!
* We do the backward using the input (X)
*/
__global__ void LayerNormBackward2_fused_add(const float* out_grad1,
const float* out_grad2,
const float* X_vals,
const float* gamma,
const float* vars,
const float* means,
float* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
float vals_arr[NORM_REG];
float vals_hat_arr[NORM_REG];
out_grad1 += (row * row_stride);
out_grad2 += (row * row_stride);
X_vals += (row * row_stride);
inp_grad += (row * row_stride);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
float gamma_reg = gamma[i * iteration_stride + id];
vals_arr[i] = out_grad1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg;
vals_hat_arr[i] = X_vals[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
float gamma_reg = gamma[high_index];
vals_arr[iterations] = out_grad1[high_index];
vals_arr[iterations] *= gamma_reg;
vals_hat_arr[iterations] = X_vals[high_index];
iterations++;
}
float var_reg = vars[row];
float mean_reg = means[row];
float sum = 0;
float xu[NORM_REG];
for (int i = 0; i < iterations; i++) {
xu[i] = (vals_hat_arr[i] - mean_reg);
sum += vals_arr[i] * xu[i];
vals_arr[i] *= rsqrtf(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
for (int i = 0; i < iterations; i++) {
vals_arr[i] += (-sum * xu[i] * rsqrtf(var_reg) / (var_reg));
}
sum = 0;
for (int i = 0; i < iterations; i++) { sum += vals_arr[i]; }
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= row_stride;
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++)
inp_grad[i * iteration_stride + id] =
(vals_arr[i] - sum) + out_grad2[i * iteration_stride + id];
if ((high_index) < row_stride)
inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index];
}
__global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
const __half* out_grad2,
const __half* X_vals,
const __half* gamma,
const __half* vars,
const __half* means,
__half* inp_grad,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id / WARP_SIZE;
int warp_num = iteration_stride >> WARP_SIZE_BITS;
__shared__ float partialSum[MAX_WARP_NUM];
__half2 vals_arr[NORM_REG];
float2 vals_arr_f[NORM_REG];
__half2 vals_hat_arr[NORM_REG];
__half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad);
const __half2* out_grad_h1 = reinterpret_cast<const __half2*>(out_grad1);
const __half2* out_grad_h2 = reinterpret_cast<const __half2*>(out_grad2);
const __half2* vals_hat_h = reinterpret_cast<const __half2*>(X_vals);
out_grad_h1 += (row * row_stride);
out_grad_h2 += (row * row_stride);
inp_grad_h += (row * row_stride);
vals_hat_h += (row * row_stride);
const __half2* gamma_h = reinterpret_cast<const __half2*>(gamma);
int high_index = iterations * iteration_stride + id;
#pragma unroll
for (int i = 0; i < iterations; i++) {
__half2 gamma_reg = gamma_h[i * iteration_stride + id];
vals_arr[i] = out_grad_h1[i * iteration_stride + id];
vals_arr[i] *= gamma_reg; // out_grad * gamma
vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
__half2 gamma_reg = gamma_h[high_index];
vals_arr[iterations] = out_grad_h1[high_index];
vals_arr[iterations] *= gamma_reg; // out_grad * gamma
vals_hat_arr[iterations] = vals_hat_h[high_index];
iterations++;
}
__half mean_h = means[row];
__half var_h = vars[row];
__half2 var_reg = __halves2half2(var_h, var_h);
__half2 mean_reg = __halves2half2(mean_h, mean_h);
__half2 xu[NORM_REG];
float sum = 0.f;
for (int i = 0; i < iterations; i++) {
xu[i] = (vals_hat_arr[i] - mean_reg);
__half2 result_h = (xu[i] * vals_arr[i]);
float2 result_f = __half22float2(result_h);
sum += result_f.x;
sum += result_f.y;
vals_arr[i] *= h2rsqrt(var_reg);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
__half2 sum_h = __float2half2_rn(sum);
for (int i = 0; i < iterations; i++) {
__half2 xu_grad = ((-sum_h * xu[i] * h2rsqrt(var_reg)) / (var_reg));
vals_arr_f[i] = __half22float2(vals_arr[i]);
float2 xu_grad_f = __half22float2(xu_grad);
vals_arr_f[i].x += xu_grad_f.x;
vals_arr_f[i].y += xu_grad_f.y;
}
sum = 0.f;
for (int i = 0; i < iterations; i++) {
sum += (vals_arr_f[i].x);
sum += (vals_arr_f[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) { sum += g.shfl_down(sum, i); }
if (g.thread_rank() == 0) partialSum[wid] = sum;
__syncthreads();
if (g.thread_rank() < warp_num) sum = partialSum[g.thread_rank()];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < warp_num; i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (2 * row_stride);
iterations = row_stride / iteration_stride;
for (int i = 0; i < iterations; i++) {
vals_arr_f[i].x -= sum;
vals_arr_f[i].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[i]);
inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id];
}
if ((high_index) < row_stride) {
vals_arr_f[iterations].x -= sum;
vals_arr_f[iterations].y -= sum;
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
}
template <>
void launch_layerNorm_backward_fused_add<float>(const float* out_grad1,
const float* out_grad2,
const float* X_data,
const float* vars,
const float* means,
const float* gamma,
float* gamma_grad,
float* betta_grad,
float* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<float>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 1;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 2;
else if (hidden_dim > 65536)
        throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads);
hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim);
}
template <>
void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1,
const __half* out_grad2,
const __half* X_data,
const __half* vars,
const __half* means,
const __half* gamma,
__half* gamma_grad,
__half* betta_grad,
__half* inp_grad,
int batch,
int hidden_dim,
hipStream_t stream[2])
{
int threads = THREADS;
dim3 grid_dim(hidden_dim / TILE_DIM);
dim3 block_dim(TILE_DIM, TILE_DIM);
hipLaunchKernelGGL(( LayerNormBackward1<__half>), dim3(grid_dim), dim3(block_dim), 0, stream[0],
out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim);
dim3 grid_dim2(batch);
if (hidden_dim > 8192 && hidden_dim <= 16384)
threads <<= 1;
else if (hidden_dim > 16384 && hidden_dim <= 32768)
threads <<= 2;
else if (hidden_dim > 32768 && hidden_dim <= 65536)
threads <<= 3;
else if (hidden_dim > 65536)
        throw std::runtime_error("Unsupported hidden_dim.");
dim3 block_dim2(threads / 2);
hipLaunchKernelGGL(( LayerNormBackward2_fused_add), dim3(grid_dim2), dim3(block_dim2), 0, stream[1],
out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2);
}
#include <math.h>
#include "custom_cuda_layers.h"
#include "general_kernels.h"
namespace cg = cooperative_groups;
dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads)
{
int seq_length4 = sequence_length / 4;
int block_compute_size =
(seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1);
// Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// The batch size is typically relatively small, while the sequence length could potentially be
// arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit.
unsigned x = heads * sequence_length / block_compute_size;
unsigned y = batch_size;
return {x, y};
}
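// Example with hypothetical sizes: batch_size = 8, heads = 16, sequence_length = 128, threads = 128
// gives seq_length4 = 32, block_compute_size = 2^floor(log2(128 / 32)) = 4, and a grid of
// {heads * sequence_length / 4 = 512, 8}, i.e. each block covers four attention rows.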
// Fused attention + softmax
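// Per attention row, the kernel below computes a numerically stable masked softmax:
//   y_i = exp(x_i + m_i - M) / (sum_k exp(x_k + m_k - M) + 1e-6),  where M = max_k(x_k + m_k)
// and m is the additive attention mask. The max and the sum are first reduced within a warp tile
// and, for longer rows, combined across warps through the partialSum shared buffer.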
template <int tbSize, int blockStride, int tbSeq>
__global__ void attn_softmax(float* vals,
const float* attn_mask,
int heads,
int seq_length,
int iterations)
{
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS;
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int batch = blockIdx.y;
int row = blockIdx.x;
int max_threads_in_sequence = std::max(seq_length, tbSeq);
int seq_lane = threadIdx.x % max_threads_in_sequence;
int data_offset = batch * (gridDim.x * block_width) + row * block_width +
(threadIdx.x / max_threads_in_sequence) * seq_length;
int mask_offset = batch * seq_length;
int wid = threadIdx.x >> WARP_SIZE_BITS;
int lane = threadIdx.x & 0x1f;
float4* val_cast = reinterpret_cast<float4*>(vals);
const float4* attn_mask_cast = reinterpret_cast<const float4*>(attn_mask);
float4 data[MAX_THREAD_ITERATIONS];
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float4 mask = attn_mask_cast[mask_offset + data_id];
data[i] = val_cast[data_offset + data_id];
data[i].x += mask.x;
data[i].y += mask.y;
data[i].z += mask.z;
data[i].w += mask.w;
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
} else {
data[i].x = minus_infinity;
data[i].y = minus_infinity;
data[i].z = minus_infinity;
data[i].w = minus_infinity;
}
}
for (int i = 1; i < tbSize; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / tbSize);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
data[i].x = __expf(data[i].x - max_val);
data[i].y = __expf(data[i].y - max_val);
data[i].z = __expf(data[i].z - max_val);
data[i].w = __expf(data[i].w - max_val);
sum += (data[i].x + data[i].y + data[i].z + data[i].w);
}
for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); }
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / tbSize);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
data[i].x /= sum;
data[i].y /= sum;
data[i].z /= sum;
data[i].w /= sum;
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) val_cast[data_offset + data_id] = data[i];
}
}
template <int tbSize, int blockStride, int tbSeq>
__global__ void attn_softmax(__half* vals,
const __half* attn_mask,
int heads,
int seq_length,
int iterations)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS;
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int batch = blockIdx.y;
int row = blockIdx.x;
int max_threads_in_sequence = std::max(seq_length, tbSeq);
int seq_lane = threadIdx.x % max_threads_in_sequence;
int data_offset = batch * (gridDim.x * block_width) + row * block_width +
(threadIdx.x / max_threads_in_sequence) * seq_length;
int mask_offset = batch * seq_length;
int wid = threadIdx.x >> WARP_SIZE_BITS;
int lane = threadIdx.x & 0x1f;
float2* val_cast = reinterpret_cast<float2*>(vals);
const float2* attn_mask_cast = reinterpret_cast<const float2*>(attn_mask);
val_cast += data_offset;
attn_mask_cast += mask_offset;
float2 low_data[MAX_THREAD_ITERATIONS];
float2 high_data[MAX_THREAD_ITERATIONS];
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float2 data = val_cast[data_id];
float2 mask = attn_mask_cast[data_id];
__half2* data_arr = reinterpret_cast<__half2*>(&data);
__half2* mask_arr = reinterpret_cast<__half2*>(&mask);
low_data[i] = __half22float2(data_arr[0]);
high_data[i] = __half22float2(data_arr[1]);
float2 low_mask = __half22float2(mask_arr[0]);
float2 high_mask = __half22float2(mask_arr[1]);
low_data[i].x += low_mask.x;
low_data[i].y += low_mask.y;
high_data[i].x += high_mask.x;
high_data[i].y += high_mask.y;
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
}
}
for (int i = 1; i < tbSize; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / tbSize);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
low_data[i].x = __expf(low_data[i].x - max_val);
low_data[i].y = __expf(low_data[i].y - max_val);
high_data[i].x = __expf(high_data[i].x - max_val);
high_data[i].y = __expf(high_data[i].y - max_val);
sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y);
}
}
for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); }
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / tbSize);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float2 result_f;
__half2* result_h = reinterpret_cast<__half2*>(&result_f);
low_data[i].x /= sum;
low_data[i].y /= sum;
high_data[i].x /= sum;
high_data[i].y /= sum;
result_h[0] = __float22half2_rn(low_data[i]);
result_h[1] = __float22half2_rn(high_data[i]);
val_cast[data_id] = result_f;
}
}
#endif
}
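// Dispatch note: the kernels treat each row as sequence_length / 4 packed groups, so tbSize is
// chosen as roughly sequence_length / 4 (capped at the 32-lane warp), blockStride = threads / tbSeq
// is how many attention rows share a thread block, and iterations covers rows longer than one pass.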
template <typename T>
void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t);
template <>
void launch_attn_softmax<float>(float* vals,
const float* attn_mask,
int batch_size,
int heads,
int sequence_length,
cudaStream_t stream)
{
const int threads = 128;
int seq_length4 = sequence_length / 4;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
int iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 8)
attn_softmax<2, (threads / 2), 2>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 16)
attn_softmax<4, (threads / 4), 4>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 32)
attn_softmax<8, (threads / 8), 8>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 64)
attn_softmax<16, (threads / 16), 16>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 128)
attn_softmax<32, (threads / 32), 32>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 256)
attn_softmax<32, (threads / 64), 64>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else {
const int threads = 256;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 512)
attn_softmax<32, (threads / 128), 128><<<grid_dim, block_dim, 0, stream>>>(
vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4))
attn_softmax<32, 1, 128><<<grid_dim, block_dim, 0, stream>>>(
vals, attn_mask, heads, seq_length4, iterations);
else
throw std::runtime_error(
"Unsupport Seq_Length! Check the restriction of the max_threads and "
"max_thread_iterations!");
}
}
template <>
void launch_attn_softmax<__half>(__half* vals,
const __half* attn_mask,
int batch_size,
int heads,
int sequence_length,
cudaStream_t stream)
{
const int threads = 128;
int seq_length4 = sequence_length / 4;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
int iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 8)
attn_softmax<2, (threads / 2), 2>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 16)
attn_softmax<4, (threads / 4), 4>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 32)
attn_softmax<8, (threads / 8), 8>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 64)
attn_softmax<16, (threads / 16), 16>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 128)
attn_softmax<32, (threads / 32), 32>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 256)
attn_softmax<32, (threads / 64), 64>
<<<grid_dim, block_dim, 0, stream>>>(vals, attn_mask, heads, seq_length4, iterations);
else {
const int threads = 256;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 512)
attn_softmax<32, (threads / 128), 128><<<grid_dim, block_dim, 0, stream>>>(
vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4))
attn_softmax<32, 1, 128><<<grid_dim, block_dim, 0, stream>>>(
vals, attn_mask, heads, seq_length4, iterations);
else
throw std::runtime_error(
"Unsupport Seq_Length! Check the restriction of the max_threads and "
"max_thread_iterations!");
}
}
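// Softmax backward: with y = softmax(x) and dy the incoming gradient, each row needs
//   dx_i = y_i * (dy_i - sum_j(dy_j * y_j))
// The kernels below first reduce s = sum_j(dy_j * y_j) across the row, then apply the
// elementwise update in place on out_grad.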
template <typename T, int tbSize, int blockStride>
__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length)
{
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32)
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride)
? (seq_length + iteration_stride - 1) / iteration_stride
: MAX_THREAD_ITERATIONS);
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int lane = id & 0x1f;
T val_reg[MAX_THREAD_ITERATIONS];
T soft_reg[MAX_THREAD_ITERATIONS];
float grad_reg = 0.0f;
#pragma unroll
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + id;
if (data_id < block_width) {
val_reg[i] = out_grad[row * block_width + data_id];
soft_reg[i] = soft_inp[row * block_width + data_id];
grad_reg += ((float)val_reg[i] *
                         (float)soft_reg[i]); // if this multiplication were done in half
                                              // precision, we could lose ~2% of accuracy
}
}
for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i);
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = grad_reg;
b.sync();
if (lane < warp_num) grad_reg = partialSum[lane];
int iters = warp_num;
if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i);
grad_reg = g.shfl(grad_reg, id / tbSize);
}
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + id;
if (data_id < block_width) {
float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg);
out_grad[row * block_width + data_id] = (T)temp;
}
}
}
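// The _v2 variant maps one warp to one softmax row: threadIdx.y picks the row within the block
// (warps_per_block rows per block) and each lane strides through the row in steps of WARP_SIZE,
// so the launcher below selects ITERATIONS as roughly ceil(seq_length / WARP_SIZE).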
template <typename T, int ITERATIONS>
__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
const T* output,
int softmax_length)
{
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
int offset = batch_idx * softmax_length + threadIdx.x;
grad += offset;
output += offset;
T grad_reg[ITERATIONS];
T output_reg[ITERATIONS];
float sum = 0.0;
#pragma unroll
for (int i = 0; i < ITERATIONS; ++i) {
int curr_idx = threadIdx.x + i * WARP_SIZE;
if (curr_idx < softmax_length) {
grad_reg[i] = grad[i * WARP_SIZE];
output_reg[i] = output[i * WARP_SIZE];
sum += (float)grad_reg[i] * (float)output_reg[i];
}
}
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
#pragma unroll
for (int i = 0; i < ITERATIONS; ++i) {
int curr_idx = threadIdx.x + i * WARP_SIZE;
if (curr_idx < softmax_length)
grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum);
}
}
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream)
{
const int warps_per_block = 4;
dim3 grid_dim(batch_size * heads * seq_length / warps_per_block);
dim3 block_dim(WARP_SIZE, warps_per_block);
if (seq_length <= 32)
softmax_backward_kernel_v2<T, 1>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 64)
softmax_backward_kernel_v2<T, 2>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 128)
softmax_backward_kernel_v2<T, 4>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 256)
softmax_backward_kernel_v2<T, 8>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 384)
softmax_backward_kernel_v2<T, 12>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 512)
softmax_backward_kernel_v2<T, 16>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 768)
softmax_backward_kernel_v2<T, 24>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 1024)
softmax_backward_kernel_v2<T, 32>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 2048)
softmax_backward_kernel_v2<T, 64>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else
throw std::runtime_error(
std::string("Special sequence length found in softmax backward, seq_length: ") +
std::to_string(seq_length));
}
template void launch_attn_softmax_backward_v2<__half>(__half* out_grad,
const __half* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
template void launch_attn_softmax_backward_v2<float>(float* out_grad,
const float* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <math.h>
#include "custom_hip_layers.h"
#include "general_kernels_hip.h"
namespace cg = cooperative_groups;
dim3 get_attn_softmax_grid(int batch_size, int heads, int sequence_length, int threads)
{
int seq_length4 = sequence_length / 4;
int block_compute_size =
(seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1);
// Note that the Y and Z dimensions are limited to 65535, while X is basically unlimited:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// The batch size is typically relatively small, while the sequence length could potentially be
// arbitrarily large. We therefore place the batch size second to avoid hitting the Y limit.
unsigned x = heads * sequence_length / block_compute_size;
unsigned y = batch_size;
return {x, y};
}
// Fused attention + softmax
template <int tbSize, int blockStride, int tbSeq>
__global__ void attn_softmax(float* vals,
const float* attn_mask,
int heads,
int seq_length,
int iterations)
{
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS;
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int batch = blockIdx.y;
int row = blockIdx.x;
int max_threads_in_sequence = ::max(seq_length, tbSeq);
int seq_lane = threadIdx.x % max_threads_in_sequence;
int data_offset = batch * (gridDim.x * block_width) + row * block_width +
(threadIdx.x / max_threads_in_sequence) * seq_length;
int mask_offset = batch * seq_length;
int wid = threadIdx.x >> WARP_SIZE_BITS;
int lane = threadIdx.x & 0x1f;
float4* val_cast = reinterpret_cast<float4*>(vals);
const float4* attn_mask_cast = reinterpret_cast<const float4*>(attn_mask);
float4 data[MAX_THREAD_ITERATIONS];
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float4 mask = attn_mask_cast[mask_offset + data_id];
data[i] = val_cast[data_offset + data_id];
data[i].x += mask.x;
data[i].y += mask.y;
data[i].z += mask.z;
data[i].w += mask.w;
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
} else {
data[i].x = minus_infinity;
data[i].y = minus_infinity;
data[i].z = minus_infinity;
data[i].w = minus_infinity;
}
}
for (int i = 1; i < tbSize; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / tbSize);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
data[i].x = __expf(data[i].x - max_val);
data[i].y = __expf(data[i].y - max_val);
data[i].z = __expf(data[i].z - max_val);
data[i].w = __expf(data[i].w - max_val);
sum += (data[i].x + data[i].y + data[i].z + data[i].w);
}
for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); }
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / tbSize);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
data[i].x /= sum;
data[i].y /= sum;
data[i].z /= sum;
data[i].w /= sum;
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) val_cast[data_offset + data_id] = data[i];
}
}
template <int tbSize, int blockStride, int tbSeq>
__global__ void attn_softmax(__half* vals,
const __half* attn_mask,
int heads,
int seq_length,
int iterations)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS;
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int batch = blockIdx.y;
int row = blockIdx.x;
int max_threads_in_sequence = ::max(seq_length, tbSeq);
int seq_lane = threadIdx.x % max_threads_in_sequence;
int data_offset = batch * (gridDim.x * block_width) + row * block_width +
(threadIdx.x / max_threads_in_sequence) * seq_length;
int mask_offset = batch * seq_length;
int wid = threadIdx.x >> WARP_SIZE_BITS;
int lane = threadIdx.x & 0x1f;
float2* val_cast = reinterpret_cast<float2*>(vals);
const float2* attn_mask_cast = reinterpret_cast<const float2*>(attn_mask);
val_cast += data_offset;
attn_mask_cast += mask_offset;
float2 low_data[MAX_THREAD_ITERATIONS];
float2 high_data[MAX_THREAD_ITERATIONS];
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float2 data = val_cast[data_id];
float2 mask = attn_mask_cast[data_id];
__half2* data_arr = reinterpret_cast<__half2*>(&data);
__half2* mask_arr = reinterpret_cast<__half2*>(&mask);
low_data[i] = __half22float2(data_arr[0]);
high_data[i] = __half22float2(data_arr[1]);
float2 low_mask = __half22float2(mask_arr[0]);
float2 high_mask = __half22float2(mask_arr[1]);
low_data[i].x += low_mask.x;
low_data[i].y += low_mask.y;
high_data[i].x += high_mask.x;
high_data[i].y += high_mask.y;
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
}
}
for (int i = 1; i < tbSize; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / tbSize);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
low_data[i].x = __expf(low_data[i].x - max_val);
low_data[i].y = __expf(low_data[i].y - max_val);
high_data[i].x = __expf(high_data[i].x - max_val);
high_data[i].y = __expf(high_data[i].y - max_val);
sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y);
}
}
for (int i = 1; i < tbSize; i *= 2) { sum += g.shfl_xor(sum, i); }
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
#ifndef __STOCHASTIC_MODE__
b.sync();
#endif
int iters = warp_num;
if (seq_length < iteration_stride)
iters = warp_num / (iteration_stride / max_threads_in_sequence);
for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / tbSize);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + seq_lane;
if (data_id < seq_length) {
float2 result_f;
__half2* result_h = reinterpret_cast<__half2*>(&result_f);
low_data[i].x /= sum;
low_data[i].y /= sum;
high_data[i].x /= sum;
high_data[i].y /= sum;
result_h[0] = __float22half2_rn(low_data[i]);
result_h[1] = __float22half2_rn(high_data[i]);
val_cast[data_id] = result_f;
}
}
#endif
}
template <typename T>
void launch_attn_softmax(T*, const T*, int, int, int, hipStream_t);
template <>
void launch_attn_softmax<float>(float* vals,
const float* attn_mask,
int batch_size,
int heads,
int sequence_length,
hipStream_t stream)
{
const int threads = 128;
int seq_length4 = sequence_length / 4;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
int iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 8)
hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 16)
hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 32)
hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 64)
hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 128)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 256)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else {
const int threads = 256;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 512)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4))
hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, attn_mask, heads, seq_length4, iterations);
else
throw std::runtime_error(
"Unsupport Seq_Length! Check the restriction of the max_threads and "
"max_thread_iterations!");
}
}
template <>
void launch_attn_softmax<__half>(__half* vals,
const __half* attn_mask,
int batch_size,
int heads,
int sequence_length,
hipStream_t stream)
{
const int threads = 128;
int seq_length4 = sequence_length / 4;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
int iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 8)
hipLaunchKernelGGL(( attn_softmax<2, (threads / 2), 2>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 16)
hipLaunchKernelGGL(( attn_softmax<4, (threads / 4), 4>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 32)
hipLaunchKernelGGL(( attn_softmax<8, (threads / 8), 8>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 64)
hipLaunchKernelGGL(( attn_softmax<16, (threads / 16), 16>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 128)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 32), 32>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length <= 256)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 64), 64>)
, dim3(grid_dim), dim3(block_dim), 0, stream, vals, attn_mask, heads, seq_length4, iterations);
else {
const int threads = 256;
dim3 grid_dim = get_attn_softmax_grid(batch_size, heads, sequence_length, threads);
int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads;
dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
subblock_max_workload * threads)
: threads);
iterations =
(sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
: MAX_THREAD_ITERATIONS);
if (sequence_length <= 512)
hipLaunchKernelGGL(( attn_softmax<32, (threads / 128), 128>), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, attn_mask, heads, seq_length4, iterations);
else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4))
hipLaunchKernelGGL(( attn_softmax<32, 1, 128>), dim3(grid_dim), dim3(block_dim), 0, stream,
vals, attn_mask, heads, seq_length4, iterations);
else
throw std::runtime_error(
"Unsupport Seq_Length! Check the restriction of the max_threads and "
"max_thread_iterations!");
}
}
template <typename T, int tbSize, int blockStride>
__global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length)
{
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS; // warp-count = num_threads / WARP_SIZE (32)
int iteration_stride = blockDim.x;
int block_width = blockStride * seq_length;
int iterations = (seq_length < (MAX_THREAD_ITERATIONS * iteration_stride)
? (seq_length + iteration_stride - 1) / iteration_stride
: MAX_THREAD_ITERATIONS);
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int wid = id >> WARP_SIZE_BITS;
int lane = id & 0x1f;
T val_reg[MAX_THREAD_ITERATIONS];
T soft_reg[MAX_THREAD_ITERATIONS];
float grad_reg = 0.0f;
#pragma unroll
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + id;
if (data_id < block_width) {
val_reg[i] = out_grad[row * block_width + data_id];
soft_reg[i] = soft_inp[row * block_width + data_id];
grad_reg += ((float)val_reg[i] *
                         (float)soft_reg[i]); // if this multiplication were done in half
                                              // precision, we could lose ~2% of accuracy
}
}
for (int i = 1; i < tbSize; i *= 2) grad_reg += g.shfl_xor(grad_reg, i);
if (seq_length > tbSize) {
if (lane == 0) partialSum[wid] = grad_reg;
b.sync();
if (lane < warp_num) grad_reg = partialSum[lane];
int iters = warp_num;
if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
for (int i = 1; i < iters; i *= 2) grad_reg += g.shfl_xor(grad_reg, i);
grad_reg = g.shfl(grad_reg, id / tbSize);
}
for (int i = 0; i < iterations; i++) {
int data_id = i * iteration_stride + id;
if (data_id < block_width) {
float temp = (float)soft_reg[i] * ((float)val_reg[i] - grad_reg);
out_grad[row * block_width + data_id] = (T)temp;
}
}
}
template <typename T, int ITERATIONS>
__global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
const T* output,
int softmax_length)
{
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
int offset = batch_idx * softmax_length + threadIdx.x;
grad += offset;
output += offset;
T grad_reg[ITERATIONS];
T output_reg[ITERATIONS];
float sum = 0.0;
#pragma unroll
for (int i = 0; i < ITERATIONS; ++i) {
int curr_idx = threadIdx.x + i * WARP_SIZE;
if (curr_idx < softmax_length) {
grad_reg[i] = grad[i * WARP_SIZE];
output_reg[i] = output[i * WARP_SIZE];
sum += (float)grad_reg[i] * (float)output_reg[i];
}
}
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
#pragma unroll
for (int i = 0; i < ITERATIONS; ++i) {
int curr_idx = threadIdx.x + i * WARP_SIZE;
if (curr_idx < softmax_length)
grad[i * WARP_SIZE] = (float)output_reg[i] * ((float)grad_reg[i] - sum);
}
}
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream)
{
const int warps_per_block = 4;
dim3 grid_dim(batch_size * heads * seq_length / warps_per_block);
dim3 block_dim(WARP_SIZE, warps_per_block);
if (seq_length <= 32)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 1>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 64)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 2>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 128)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 4>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 256)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 8>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 384)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 12>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 512)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 16>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 768)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 24>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 1024)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 32>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else if (seq_length <= 2048)
hipLaunchKernelGGL(( softmax_backward_kernel_v2<T, 64>)
, dim3(grid_dim), dim3(block_dim), 0, stream, out_grad, soft_inp, seq_length);
else
throw std::runtime_error(
std::string("Special sequence length found in softmax backward, seq_length: ") +
std::to_string(seq_length));
}
template void launch_attn_softmax_backward_v2<__half>(__half* out_grad,
const __half* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
template void launch_attn_softmax_backward_v2<float>(float* out_grad,
const float* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
#include "custom_cuda_layers.h"
#define rows_trans 16
#define cols_trans 16
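// A shared-memory tiled transpose: each block stages a rows_trans x cols_trans (16 x 16) tile of
// the input in data_block, synchronizes, and writes the tile back with its rows and columns swapped.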
template <typename T>
__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width)
{
__shared__ T data_block[rows_trans * (cols_trans + 1)];
int r = threadIdx.x / cols_trans;
int c = threadIdx.x % cols_trans;
int m = row_width / cols_trans;
int i = blockIdx.x / m * rows_trans + r;
int j = blockIdx.x % m * cols_trans + c;
int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS);
for (int k = 0; k < rows_trans; k += row_stride)
data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j];
__syncthreads();
i = blockIdx.x % m * rows_trans + r;
j = blockIdx.x / m * cols_trans + c;
for (int k = 0; k < rows_trans; k += row_stride)
out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k];
}
template <>
void Transpose<__half>(const __half* inp_mat,
__half* out_mat,
int rows,
int cols,
cudaStream_t stream)
{
int threads = THREADS;
Transpose_Kernel<__half><<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>(
inp_mat, out_mat, cols, rows);
}
template <>
void Transpose<float>(const float* inp_mat, float* out_mat, int rows, int cols, cudaStream_t stream)
{
int threads = THREADS;
Transpose_Kernel<float><<<(rows * cols + threads - 1) / threads, threads, 0, stream>>>(
inp_mat, out_mat, cols, rows);
}
template <typename T>
__global__ void transform_0213(T* output,
const T* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext);
template <>
__global__ void transform_0213<float>(float* output,
const float* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / head_ext; // Sequence ID (0-127)
int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec = reinterpret_cast<float4*>(output);
float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3];
output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs;
}
template <>
__global__ void transform_0213<__half>(__half* output,
const __half* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / head_ext; // Sequence ID (0-127)
int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr[1];
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec = reinterpret_cast<float4*>(output);
vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3];
output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0];
#endif
}
template <>
void launch_transform_0213<float>(float* output,
const float* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream)
{
hidden_dim >>= 2;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, (seq_length * head_ext));
transform_0213<float>
<<<grid_dim, block_dim, 0, stream>>>(output, vals, hidden_dim, seq_length, heads, head_ext);
}
template <>
void launch_transform_0213<__half>(__half* output,
const __half* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream)
{
hidden_dim >>= 3;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, (seq_length * head_ext));
transform_0213<__half>
<<<grid_dim, block_dim, 0, stream>>>(output, vals, hidden_dim, seq_length, heads, head_ext);
}
// Bias add
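// bias_add_transform_0213 fuses the QKV bias add with the "0213" layout permutation: for each of
// the trans_count slices (e.g. Q, K and V when trans_count is 3) it reads that slice of every token
// from the [B S C*H] input, adds the per-channel bias, and writes the result head-major as
// [B A S N] (A = heads, N = H / heads); see the shape comment before the launchers below.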
template <typename T>
__global__ void bias_add_transform_0213(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext);
template <>
__global__ void bias_add_transform_0213<float>(float* output,
const float* vals,
const float* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = blockIdx.z / head_ext; // Hidden count
int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
float4 inputs = vals_vec[d0 * d0_stride * (gridDim.z / head_ext) + cnt * d1_stride +
d1 * d1_stride * (gridDim.z / head_ext) + d2 * d2_stride + d3];
float4 biases = bias_vec[cnt * d1_stride + d2 * d2_stride + d3];
float4 outputs;
outputs.x = inputs.x + biases.x;
outputs.y = inputs.y + biases.y;
outputs.z = inputs.z + biases.z;
outputs.w = inputs.w + biases.w;
output_vec[cnt * d0_out_stride * gridDim.x + d0 * d0_out_stride + d1 * d1_out_stride +
d2 * d2_out_stride + d3] = outputs;
}
#define ATTN_H 3
#define MAX_SEQ_LINE 10
template <>
__global__ void bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = blockIdx.z / head_ext; // Hidden count
int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr;
float4 bias_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
vals_vec += (d0 * d0_stride * (gridDim.z / head_ext));
vals_vec += (d1 * d1_stride * (gridDim.z / head_ext));
vals_vec += (cnt * d1_stride);
vals_vec += (d2 * d2_stride);
bias_vec += (cnt * d1_stride);
bias_vec += (d2 * d2_stride);
output_vec += (cnt * d0_stride * gridDim.x);
output_vec += (d1 * d2_stride);
output_vec += (d0 * d0_stride);
output_vec += (d2 * d2_out_stride);
bias_arr = bias_vec[d3];
vals_arr = vals_vec[d3];
#if defined(__ACC_HALF__)
output_half[0] = vals_half[0] + bias_half[0];
output_half[1] = vals_half[1] + bias_half[1];
output_half[2] = vals_half[2] + bias_half[2];
output_half[3] = vals_half[3] + bias_half[3];
#else
float2 bias_arr_f[4];
float2 vals_arr_f[4];
#pragma unroll
for (int l = 0; l < 4; l++) {
bias_arr_f[l] = __half22float2(bias_half[l]);
vals_arr_f[l] = __half22float2(vals_half[l]);
vals_arr_f[l].x += bias_arr_f[l].x;
vals_arr_f[l].y += bias_arr_f[l].y;
output_half[l] = __float22half2_rn(vals_arr_f[l]);
}
#endif
output_vec[d3] = output_arr;
#endif
}
__global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int iteration_stride = d1_stride * blockDim.z; // Hidden * 3 / 8
int batch_stride = d0_stride * blockDim.z; // Hidden * S * 3 / 8
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = threadIdx.z; // blockIdx.z; // Hidden count
int d2 = threadIdx.y; // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr[1];
float4 bias_arr[1];
float4 output_arr[1];
__half2* vals_half = reinterpret_cast<__half2*>(vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
int iter_index = cnt * d1_stride + d2 * d2_stride + d3;
int input_offset = d0 * batch_stride + d1 * (iteration_stride << 1);
bias_arr[0] = bias_vec[iter_index];
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_id = iter * iteration_stride + iter_index;
vals_arr[0] = vals_vec[input_offset + iter_id];
output_half[0] = vals_half[0] + bias_half[0];
output_half[1] = vals_half[1] + bias_half[1];
output_half[2] = vals_half[2] + bias_half[2];
output_half[3] = vals_half[3] + bias_half[3];
in_data[iter_id] = output_arr[0];
}
__syncthreads();
iteration_stride = blockDim.z * (blockDim.y >> 1);
int matrix_stride = (d0_out_stride * gridDim.x);
int head_count = (d2 >> 1) + cnt * (blockDim.y >> 1);
int out_index = d0 * d0_out_stride + d1 * (d1_out_stride << 1) + d3 + (d2 % 2) * d2_stride;
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_row = (iter * iteration_stride) + head_count;
int iter_offset =
(iter_row % blockDim.y) * d2_out_stride + (iter_row / blockDim.y) * matrix_stride;
output_vec[out_index + iter_offset] =
in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)];
}
#endif
}
// [B S C*H] -> C * [B A S N]
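// Hypothetical example: batch_size = 8, seq_length = 128, hidden_dim = 1024, heads = 16 and
// trans_count = 3 (fused QKV). The input holds 8 * 128 tokens of 3 * 1024 values each; the output
// is three back-to-back tensors, each laid out as [8, 16, 128, 64] (64 = 1024 / 16).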
template <>
void launch_bias_add_transform_0213<float>(float* output,
const float* vals,
const float* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 2;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext));
bias_add_transform_0213<float><<<grid_dim, block_dim, 0, stream>>>(
output, vals, bias, hidden_dim, seq_length, heads, head_ext);
}
template <>
void launch_bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
if (hidden_dim > 128 || hidden_dim < 16) {
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext));
bias_add_transform_0213<__half><<<grid_dim, block_dim, 0, stream>>>(
output, vals, bias, hidden_dim, seq_length, heads, head_ext);
} else {
dim3 block_dim(hidden_dim / heads, heads, trans_count);
dim3 grid_dim(batch_size, seq_length / 2);
bias_add_transform_0213_v2<<<grid_dim, block_dim, 0, stream>>>(
output, vals, bias, hidden_dim, seq_length, heads);
}
}
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext);
template <>
__global__ void transform4d_0213<float>(float* out,
const float* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = d0_stride / heads;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = hidden_dim;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / ((seq_length - 1) / blockDim.y + 1); // Head
int d2 = (threadIdx.y + blockDim.y * blockIdx.y) % seq_length;
int cnt = blockIdx.z;
int d3 = threadIdx.x; // Values (groups of 8)
if (d2 < seq_length) {
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
float4 vals_vec = in_vec[cnt * d0_stride * gridDim.x + d0 * d0_stride + d1 * d1_stride +
d2 * d2_stride + d3];
out_vec[d0 * d0_out_stride * gridDim.z + cnt * d2_out_stride + d1 * d1_out_stride +
d2 * d2_out_stride * gridDim.z + d3] = vals_vec;
}
}
template <>
__global__ void transform4d_0213<__half>(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * (seq_length / head_ext);
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0 = blockIdx.x; // Batch
int d1 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head
int d2 = blockIdx.z / head_ext; // Sequence
int cnt = blockIdx.y; // Hidden count
int d3 = threadIdx.x; // Values (groups of 8)
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
in_vec += (cnt * d0_stride * gridDim.x);
in_vec += (d0 * d0_stride);
in_vec += (d2 * d2_stride);
in_vec += (d1 * d2_stride * seq_length);
out_vec += (cnt * d1_stride);
out_vec += (d1 * d2_stride);
out_vec += (d0 * d0_stride * gridDim.y);
out_vec += (d2 * d1_stride * gridDim.y);
out_vec[d3] = in_vec[d3];
#endif
}
__global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0 = blockIdx.x; // Batch
int d1 = threadIdx.y; // Head
int d2 = blockIdx.y; // Sequence
int cnt = threadIdx.z; // Hidden count
int d3 = threadIdx.x; // Values (groups of 8)
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + (d1 % 2) * d2_stride;
int head_count = (d1 >> 1) + cnt * (blockDim.y >> 1);
int iteration_stride = blockDim.z * (blockDim.y >> 1);
int matrix_stride = (d0_stride * gridDim.x);
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_row = iter * iteration_stride + head_count;
int iter_offset = (iter_row % blockDim.y) * d2_stride;
in_data[d3 + iter_offset + (iter_row / blockDim.y + (d1 % 2) * blockDim.z) * d1_stride] =
in_vec[input_offset + iter_offset * seq_length +
(iter_row / blockDim.y) * matrix_stride];
}
__syncthreads();
iteration_stride = d1_stride * blockDim.z;
int iter_index = cnt * d1_stride + d1 * d2_stride + d3;
int output_offset = d0 * d0_stride * blockDim.z + d2 * (iteration_stride << 1);
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_id = iter * iteration_stride + iter_index;
out_vec[output_offset + iter_id] = in_data[iter_id];
}
#endif
}
// 3 * [B A S N] -> [B S C*H]
template <>
void launch_transform4d_0213<float>(float* out,
const float* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 2;
dim3 grid_dims(batch_size, heads * ((seq_length - 1) / 8 + 1), trans_count);
dim3 block_dims(hidden_dim / heads, 8);
transform4d_0213<float>
<<<grid_dims, block_dims, 0, stream>>>(out, in, heads, seq_length, hidden_dim, 1);
}
template <>
void launch_transform4d_0213<__half>(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
if (hidden_dim > 128 || hidden_dim < 16) {
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext));
dim3 block_dims(hidden_dim / heads, (heads / head_ext));
transform4d_0213<__half><<<grid_dims, block_dims, 0, stream>>>(
out, in, heads, seq_length, hidden_dim, head_ext);
} else {
dim3 grid_dims(batch_size, seq_length / 2);
dim3 block_dims(hidden_dim / heads, heads, trans_count);
transform4d_0213_v2<<<grid_dims, block_dims, 0, stream>>>(
out, in, heads, seq_length, hidden_dim);
}
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#define rows_trans 16
#define cols_trans 16
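// Tiled matrix transpose: each block stages a rows_trans x cols_trans tile of the
// input in shared memory, then writes the tile back out in transposed order.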
template <typename T>
__global__ void Transpose_Kernel(const T* inp, T* out, int row_width, int col_width)
{
__shared__ T data_block[rows_trans * (cols_trans + 1)];
int r = threadIdx.x / cols_trans;
int c = threadIdx.x % cols_trans;
int m = row_width / cols_trans;
int i = blockIdx.x / m * rows_trans + r;
int j = blockIdx.x % m * cols_trans + c;
int row_stride = rows_trans / ((rows_trans * cols_trans + THREADS - 1) / THREADS);
for (int k = 0; k < rows_trans; k += row_stride)
data_block[(k + r) * cols_trans + c] = inp[(i + k) * row_width + j];
__syncthreads();
i = blockIdx.x % m * rows_trans + r;
j = blockIdx.x / m * cols_trans + c;
for (int k = 0; k < rows_trans; k += row_stride)
out[(i + k) * col_width + j] = data_block[c * cols_trans + r + k];
}
template <>
void Transpose<__half>(const __half* inp_mat,
__half* out_mat,
int rows,
int cols,
hipStream_t stream)
{
int threads = THREADS;
hipLaunchKernelGGL(( Transpose_Kernel<__half>), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream,
inp_mat, out_mat, cols, rows);
}
template <>
void Transpose<float>(const float* inp_mat, float* out_mat, int rows, int cols, hipStream_t stream)
{
int threads = THREADS;
hipLaunchKernelGGL(( Transpose_Kernel<float>), dim3((rows * cols + threads - 1) / threads), dim3(threads), 0, stream,
inp_mat, out_mat, cols, rows);
}
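// transform_0213: permutes activations from [batch, seq, heads, head_size] to
// [batch, heads, seq, head_size] (the 0-2-1-3 dimension order); each thread moves
// one float4-sized group of values.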
template <typename T>
__global__ void transform_0213(T* output,
const T* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext);
template <>
__global__ void transform_0213<float>(float* output,
const float* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / head_ext; // Sequence ID (0-127)
int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec = reinterpret_cast<float4*>(output);
float4 inputs = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3];
output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = inputs;
}
template <>
__global__ void transform_0213<__half>(__half* output,
const __half* vals,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / head_ext; // Sequence ID (0-127)
int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr[1];
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec = reinterpret_cast<float4*>(output);
vals_arr[0] = vals_vec[d0 * d0_stride + d1 * d1_stride + d2 * d2_stride + d3];
output_vec[d0 * d0_out_stride + d1 * d1_out_stride + d2 * d2_out_stride + d3] = vals_arr[0];
#endif
}
template <>
void launch_transform_0213<float>(float* output,
const float* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream)
{
hidden_dim >>= 2;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, (seq_length * head_ext));
hipLaunchKernelGGL(( transform_0213<float>)
, dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext);
}
template <>
void launch_transform_0213<__half>(__half* output,
const __half* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream)
{
hidden_dim >>= 3;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, (seq_length * head_ext));
hipLaunchKernelGGL(( transform_0213<__half>)
, dim3(grid_dim), dim3(block_dim), 0, stream, output, vals, hidden_dim, seq_length, heads, head_ext);
}
// Bias add
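// bias_add_transform_0213: adds the fused QKV bias and permutes the input from
// [batch, seq, trans_count * hidden] into trans_count (typically 3, for Q/K/V)
// tensors laid out as [batch, heads, seq, head_size].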
template <typename T>
__global__ void bias_add_transform_0213(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext);
template <>
__global__ void bias_add_transform_0213<float>(float* output,
const float* vals,
const float* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = blockIdx.z / head_ext; // Hidden count
int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
float4 inputs = vals_vec[d0 * d0_stride * (gridDim.z / head_ext) + cnt * d1_stride +
d1 * d1_stride * (gridDim.z / head_ext) + d2 * d2_stride + d3];
float4 biases = bias_vec[cnt * d1_stride + d2 * d2_stride + d3];
float4 outputs;
outputs.x = inputs.x + biases.x;
outputs.y = inputs.y + biases.y;
outputs.z = inputs.z + biases.z;
outputs.w = inputs.w + biases.w;
output_vec[cnt * d0_out_stride * gridDim.x + d0 * d0_out_stride + d1 * d1_out_stride +
d2 * d2_out_stride + d3] = outputs;
}
#define ATTN_H 3
#define MAX_SEQ_LINE 10
template <>
__global__ void bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = blockIdx.z / head_ext; // Hidden count
int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr;
float4 bias_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
vals_vec += (d0 * d0_stride * (gridDim.z / head_ext));
vals_vec += (d1 * d1_stride * (gridDim.z / head_ext));
vals_vec += (cnt * d1_stride);
vals_vec += (d2 * d2_stride);
bias_vec += (cnt * d1_stride);
bias_vec += (d2 * d2_stride);
output_vec += (cnt * d0_stride * gridDim.x);
output_vec += (d1 * d2_stride);
output_vec += (d0 * d0_stride);
output_vec += (d2 * d2_out_stride);
bias_arr = bias_vec[d3];
vals_arr = vals_vec[d3];
#if defined(__ACC_HALF__)
output_half[0] = vals_half[0] + bias_half[0];
output_half[1] = vals_half[1] + bias_half[1];
output_half[2] = vals_half[2] + bias_half[2];
output_half[3] = vals_half[3] + bias_half[3];
#else
float2 bias_arr_f[4];
float2 vals_arr_f[4];
#pragma unroll
for (int l = 0; l < 4; l++) {
bias_arr_f[l] = __half22float2(bias_half[l]);
vals_arr_f[l] = __half22float2(vals_half[l]);
vals_arr_f[l].x += bias_arr_f[l].x;
vals_arr_f[l].y += bias_arr_f[l].y;
output_half[l] = __float22half2_rn(vals_arr_f[l]);
}
#endif
output_vec[d3] = output_arr;
#endif
}
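// Shared-memory variant selected by the launcher when hidden_dim/8 is between 16 and 128:
// each block handles two sequence positions, stages the bias-added values in shared
// memory, and rewrites them in the permuted layout.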
__global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int iteration_stride = d1_stride * blockDim.z; // Hidden * 3 / 8
int batch_stride = d0_stride * blockDim.z; // Hidden * S * 3 / 8
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = d2_stride * seq_length;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y; // Sequence ID (0-127)
int cnt = threadIdx.z; // blockIdx.z; // Hidden count
int d2 = threadIdx.y; // Head (0-11)
int d3 = threadIdx.x; // Values (groups of 4)
float4 vals_arr[1];
float4 bias_arr[1];
float4 output_arr[1];
__half2* vals_half = reinterpret_cast<__half2*>(vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
float4* output_vec = reinterpret_cast<float4*>(output);
int iter_index = cnt * d1_stride + d2 * d2_stride + d3;
int input_offset = d0 * batch_stride + d1 * (iteration_stride << 1);
bias_arr[0] = bias_vec[iter_index];
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_id = iter * iteration_stride + iter_index;
vals_arr[0] = vals_vec[input_offset + iter_id];
output_half[0] = vals_half[0] + bias_half[0];
output_half[1] = vals_half[1] + bias_half[1];
output_half[2] = vals_half[2] + bias_half[2];
output_half[3] = vals_half[3] + bias_half[3];
in_data[iter_id] = output_arr[0];
}
__syncthreads();
iteration_stride = blockDim.z * (blockDim.y >> 1);
int matrix_stride = (d0_out_stride * gridDim.x);
int head_count = (d2 >> 1) + cnt * (blockDim.y >> 1);
int out_index = d0 * d0_out_stride + d1 * (d1_out_stride << 1) + d3 + (d2 % 2) * d2_stride;
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_row = (iter * iteration_stride) + head_count;
int iter_offset =
(iter_row % blockDim.y) * d2_out_stride + (iter_row / blockDim.y) * matrix_stride;
output_vec[out_index + iter_offset] =
in_data[iter_row * d2_stride + d3 + (d2 % 2) * (d1_stride * blockDim.z)];
}
#endif
}
// [B S C*H] -> C * [B A S N]
template <>
void launch_bias_add_transform_0213<float>(float* output,
const float* vals,
const float* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream,
int trans_count)
{
hidden_dim >>= 2;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext));
hipLaunchKernelGGL(( bias_add_transform_0213<float>), dim3(grid_dim), dim3(block_dim), 0, stream,
output, vals, bias, hidden_dim, seq_length, heads, head_ext);
}
template <>
void launch_bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
if (hidden_dim > 128 || hidden_dim < 16) {
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 block_dim(hidden_dim / heads, (heads / head_ext));
dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext));
hipLaunchKernelGGL(( bias_add_transform_0213<__half>), dim3(grid_dim), dim3(block_dim), 0, stream,
output, vals, bias, hidden_dim, seq_length, heads, head_ext);
} else {
dim3 block_dim(hidden_dim / heads, heads, trans_count);
dim3 grid_dim(batch_size, seq_length / 2);
hipLaunchKernelGGL(( bias_add_transform_0213_v2), dim3(grid_dim), dim3(block_dim), 0, stream,
output, vals, bias, hidden_dim, seq_length, heads);
}
}
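// transform4d_0213: the inverse transform; gathers trans_count [batch, heads, seq, head_size]
// tensors back into a single [batch, seq, trans_count * hidden] tensor
// (see the launcher comment below: 3 * [B A S N] -> [B S C*H]).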
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext);
template <>
__global__ void transform4d_0213<float>(float* out,
const float* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
int d0_stride = hidden_dim * seq_length;
int d1_stride = d0_stride / heads;
int d2_stride = hidden_dim / heads;
int d0_out_stride = d0_stride;
int d1_out_stride = d2_stride;
int d2_out_stride = hidden_dim;
int d0 = blockIdx.x; // Batch
int d1 = blockIdx.y / ((seq_length - 1) / blockDim.y + 1); // Head
int d2 = (threadIdx.y + blockDim.y * blockIdx.y) % seq_length;
int cnt = blockIdx.z;
int d3 = threadIdx.x; // Values (groups of 8)
if (d2 < seq_length) {
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
float4 vals_vec = in_vec[cnt * d0_stride * gridDim.x + d0 * d0_stride + d1 * d1_stride +
d2 * d2_stride + d3];
out_vec[d0 * d0_out_stride * gridDim.z + cnt * d2_out_stride + d1 * d1_out_stride +
d2 * d2_out_stride * gridDim.z + d3] = vals_vec;
}
}
template <>
__global__ void transform4d_0213<__half>(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * (seq_length / head_ext);
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0 = blockIdx.x; // Batch
int d1 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head
int d2 = blockIdx.z / head_ext; // Sequence
int cnt = blockIdx.y; // Hidden count
int d3 = threadIdx.x; // Values (groups of 8)
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
in_vec += (cnt * d0_stride * gridDim.x);
in_vec += (d0 * d0_stride);
in_vec += (d2 * d2_stride);
in_vec += (d1 * d2_stride * seq_length);
out_vec += (cnt * d1_stride);
out_vec += (d1 * d2_stride);
out_vec += (d0 * d0_stride * gridDim.y);
out_vec += (d2 * d1_stride * gridDim.y);
out_vec[d3] = in_vec[d3];
#endif
}
__global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim)
{
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
int d0 = blockIdx.x; // Batch
int d1 = threadIdx.y; // Head
int d2 = blockIdx.y; // Sequence
int cnt = threadIdx.z; // Hidden count
int d3 = threadIdx.x; // Values (groups of 8)
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + (d1 % 2) * d2_stride;
int head_count = (d1 >> 1) + cnt * (blockDim.y >> 1);
int iteration_stride = blockDim.z * (blockDim.y >> 1);
int matrix_stride = (d0_stride * gridDim.x);
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_row = iter * iteration_stride + head_count;
int iter_offset = (iter_row % blockDim.y) * d2_stride;
in_data[d3 + iter_offset + (iter_row / blockDim.y + (d1 % 2) * blockDim.z) * d1_stride] =
in_vec[input_offset + iter_offset * seq_length +
(iter_row / blockDim.y) * matrix_stride];
}
__syncthreads();
iteration_stride = d1_stride * blockDim.z;
int iter_index = cnt * d1_stride + d1 * d2_stride + d3;
int output_offset = d0 * d0_stride * blockDim.z + d2 * (iteration_stride << 1);
#pragma unroll
for (int iter = 0; iter < 2; iter++) {
int iter_id = iter * iteration_stride + iter_index;
out_vec[output_offset + iter_id] = in_data[iter_id];
}
#endif
}
// 3 * [B A S N] -> [B S C*H]
template <>
void launch_transform4d_0213<float>(float* out,
const float* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count)
{
hidden_dim >>= 2;
dim3 grid_dims(batch_size, heads * ((seq_length - 1) / 8 + 1), trans_count);
dim3 block_dims(hidden_dim / heads, 8);
hipLaunchKernelGGL(( transform4d_0213<float>)
, dim3(grid_dims), dim3(block_dims), 0, stream, out, in, heads, seq_length, hidden_dim, 1);
}
template <>
void launch_transform4d_0213<__half>(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
if (hidden_dim > 128 || hidden_dim < 16) {
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext));
dim3 block_dims(hidden_dim / heads, (heads / head_ext));
hipLaunchKernelGGL(( transform4d_0213<__half>), dim3(grid_dims), dim3(block_dims), 0, stream,
out, in, heads, seq_length, hidden_dim, head_ext);
} else {
dim3 grid_dims(batch_size, seq_length / 2);
dim3 block_dims(hidden_dim / heads, heads, trans_count);
hipLaunchKernelGGL(( transform4d_0213_v2), dim3(grid_dims), dim3(block_dims), 0, stream,
out, in, heads, seq_length, hidden_dim);
}
}
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import sys
import types
from typing import Optional, Union
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from packaging import version as pkg_version
from . import ops
from . import module_inject
from .runtime.engine import DeepSpeedEngine
from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from .runtime.pipe.engine import PipelineEngine
from .inference.engine import InferenceEngine
from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError
from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .module_inject import replace_transformer_layer, revert_transformer_layer
from .utils import log_dist
from .utils.distributed import init_distributed
@@ -25,9 +36,8 @@ from .git_version_info import version, git_hash, git_branch
def _parse_version(version_str):
'''Parse a version string and extract the major, minor, and patch versions.'''
import re
matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str)
return int(matched.group(1)), int(matched.group(2)), int(matched.group(3))
ver = pkg_version.parse(version_str)
return ver.major, ver.minor, ver.micro
# Export version information
@@ -36,46 +46,38 @@ __version_major__, __version_minor__, __version_patch__ = _parse_version(__versi
__git_hash__ = git_hash
__git_branch__ = git_branch
# Provide backwards compatibility with old deepspeed.pt module structure, should hopefully not be used
pt = types.ModuleType('pt', 'dummy pt module for backwards compatibility')
deepspeed = sys.modules[__name__]
setattr(deepspeed, 'pt', pt)
setattr(deepspeed.pt, 'deepspeed_utils', deepspeed.runtime.utils)
sys.modules['deepspeed.pt'] = deepspeed.pt
sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils
setattr(deepspeed.pt, 'deepspeed_config', deepspeed.runtime.config)
sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config
setattr(deepspeed.pt, 'loss_scaler', deepspeed.runtime.fp16.loss_scaler)
sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler
def initialize(args=None,
model=None,
optimizer=None,
model_parameters=None,
training_data=None,
lr_scheduler=None,
model: torch.nn.Module = None,
optimizer: Optional[Union[Optimizer,
DeepSpeedOptimizerCallable]] = None,
model_parameters: Optional[torch.nn.Module] = None,
training_data: Optional[torch.utils.data.Dataset] = None,
lr_scheduler: Optional[Union[_LRScheduler,
DeepSpeedSchedulerCallable]] = None,
mpu=None,
dist_init_required=None,
dist_init_required: Optional[bool] = None,
collate_fn=None,
config=None,
config_params=None):
"""Initialize the DeepSpeed Engine.
Arguments:
args: an object containing local_rank and deepspeed_config fields. This is optional if `config_params` is passed.
args: an object containing local_rank and deepspeed_config fields.
This is optional if `config` is passed.
model: Required: nn.Module class before applying any wrappers
optimizer: Optional: a user defined optimizer, this is typically used instead of defining
an optimizer in the DeepSpeed json config.
optimizer: Optional: a user defined Optimizer or Callable that returns an Optimizer object.
This overrides any optimizer definition in the DeepSpeed json config.
model_parameters: Optional: An iterable of torch.Tensors or dicts.
Specifies what Tensors should be optimized.
training_data: Optional: Dataset of type torch.utils.data.Dataset
lr_scheduler: Optional: Learning Rate Scheduler Object. It should define a get_lr(),
step(), state_dict(), and load_state_dict() methods
lr_scheduler: Optional: Learning Rate Scheduler Object or a Callable that takes an Optimizer and returns a Scheduler object.
The scheduler object should define a get_lr(), step(), state_dict(), and load_state_dict() methods
mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}()
@@ -87,8 +89,10 @@ def initialize(args=None,
mini-batch of Tensor(s). Used when using batched loading from a
map-style dataset.
config_params: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config
as a dictionary instead.
config: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config
as an argument instead, as a path or a dictionary.
config_params: Optional: Same as `config`, kept for backwards compatibility.
Returns:
A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``
@@ -109,7 +113,6 @@ def initialize(args=None,
__git_hash__,
__git_branch__),
ranks=[0])
assert model is not None, "deepspeed.initialize requires a model"
if not isinstance(model, PipelineModule):
@@ -122,6 +125,7 @@ def initialize(args=None,
mpu=mpu,
dist_init_required=dist_init_required,
collate_fn=collate_fn,
config=config,
config_params=config_params)
else:
assert mpu is None, "mpu must be None with pipeline parallelism"
@@ -134,6 +138,7 @@ def initialize(args=None,
mpu=model.mpu(),
dist_init_required=dist_init_required,
collate_fn=collate_fn,
config=config,
config_params=config_params)
return_items = [
@@ -210,3 +215,91 @@ def add_config_arguments(parser):
parser = _add_core_arguments(parser)
return parser
def init_inference(model,
triangular_masking=True,
mp_size=1,
training_mp_size=1,
mpu=None,
ep_group=None,
expert_mp_group=None,
checkpoint=None,
dtype=None,
injection_policy=None,
replace_method='auto',
quantization_setting=None,
replace_with_kernel_inject=False,
return_tuple=True,
ep_size=1,
moe=False,
moe_experts=1,
moe_type='standard',
args=None):
"""Initialize the DeepSpeed InferenceEngine.
Arguments:
model: Required: nn.Module class before applying any wrappers
triangular_masking: Required: the type of masking applied to the attention scores in the transformer layer;
note that the masking is application specific.
mp_size: Optional: Desired model parallel size, default is 1 meaning no
model parallelism.
training_mp_size: Optional: if loading a checkpoint, this is the mp size that it was trained with;
it may be different from the mp size that you want to use during inference.
mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}()
checkpoint: Optional: Path to deepspeed compatible checkpoint or path to
JSON with load policy.
dtype: Optional: Desired model data type, will convert model to this type.
Supported target types: torch.half, torch.int8, torch.float
injection_policy: Optional: Dictionary mapping a client nn.Module to its corresponding
injection policy. e.g., {BertLayer : deepspeed.inference.HFBertLayerPolicy}
replace_method: Optional: If 'auto' DeepSpeed will automatically try and replace
model modules with its optimized versions. If an injection_policy is set this will
override the automatic replacement behavior.
quantization_setting: Optional: Quantization settings used for quantizing your model using MoQ.
The setting can be one element or a tuple. If one value is passed in, we consider it the number
of groups used in quantization. A tuple is passed in if we want to specify extra grouping
for the MLP part of a Transformer layer (e.g. (True, 8) means the model is quantized with 8 groups
everywhere, and extra grouping is applied to the MLP part).
replace_with_kernel_inject: If set, kernel injection is applied when the inference engine is initialized.
Returns:
A deepspeed.InferenceEngine wrapped model.
"""
log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
__version__,
__git_hash__,
__git_branch__),
ranks=[0])
engine = InferenceEngine(model,
triangular_masking,
mp_size,
training_mp_size,
ep_size,
mpu,
ep_group,
expert_mp_group,
checkpoint,
dtype,
injection_policy,
return_tuple,
replace_method,
quantization_setting,
replace_with_kernel_inject,
moe,
moe_experts,
moe_type,
args)
return engine
test*
runs
autotuning_results*
autotuning_exps
output*
*.png
# DeepSpeed Autotuning
## Overview
One pain point in model training is figuring out good performance-relevant configurations, such as the micro-batch size, that fully utilize the hardware and achieve high throughput. This configuration exploration is commonly done manually, yet it is important because model training is repeated many times and benefits from a good configuration. Not only is hand-tuning time-consuming, but the outcome is hardware-dependent: a good configuration on one piece of hardware might not be the best on different hardware, so the user has to hand-tune the configuration again. With DeepSpeed, there are even more configuration parameters that can affect training speed, making manual tuning even more tedious.
The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed.
The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations.
It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods.
DeepSpeed Autotuning is easy to use, requiring no code change from DeepSpeed users.
Compared to the original training script (`deepspeed your_program.py <normal cl args> --deepspeed ds_config.json`), invoking the autotuning feature in DeepSpeed only requires setting an `autotuning` flag after the DeepSpeed launcher (see [Usage](#usage) for details), and adding `"autotuning": {"enabled": true}` to the DeepSpeed configuration file. Users can further tailor the autotuning process by changing the autotuning configuration in the DeepSpeed configuration JSON file (See [Autotuning Configuration](#autotuning-configuration) for details).
## Usage
To use DeepSpeed Autotuner, you need to do two things:
1. Add `"autotuning": {"enabled": true}` to the DeepSpeed configuration file. If the user training script uses DeepSpeed configuration parameters as command-line arguments, the name mappings between the parameters in DeepSpeed configuration and the training script arguments must be provided in the `arg_mappings` dictionary in the `autotuning` section of the DeepSpeed configuration file.
Training scripts commonly take the micro-batch size per GPU as an argument, so the mapping between that flag name and `train_micro_batch_size_per_gpu` must be provided. Below is an example where the training script takes `--per_device_train_batch_size` as the micro-batch size. Note that the leading `--` is needed.
```json
{
"autotuning": {
"enabled": true,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size"
}
}
}
```
2. Specify `--autotuning=[run|tune]` on the command line, as shown below.
```bash
deepspeed --autotuning=[run|tune] <user script> --deepspeed ds_config.json <other user args>
```
`--autotuning=run` finds the optimal DeepSpeed configuration and then launches the training with that configuration. If you want to just find the optimal configuration without running the training script, then set `--autotuning` to `tune`.
If users specify the [resource configuration](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) using the flags `--num_gpus` and `--num_nodes`, then the command becomes:
```bash
deepspeed --autotuning=[run|tune] --num_gpus=$NUM_GPUS --num_nodes=$NUM_NODES <user script> --deepspeed ds_config.json <other user args>
```
Below shows an example where `train_micro_batch_size_per_gpu` and `gradient_accumulation_steps` are mapped to `--per_device_train_batch_size` and `--gradient_accumulation_steps` as training arguments.
Example script (some details omitted):
```bash
deepspeed --autotuning run --num_nodes=1 --num_gpus=8 $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
--deepspeed $DS_CONFIG_PATH \
--model_name_or_path gpt2 \
--do_train \
--do_eval \
--fp16 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 1 \
...
```
DeepSpeed configuration file:
```json
{
"autotuning": {
"enabled": true,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
"gradient_accumulation_steps": "--gradient_accumulation_steps"
}
}
}
By default, the Autotuner only tunes ZeRO optimization stages and micro-batch sizes per GPU (`fast` mode). This reduces the autotuning time while still producing a close-to-optimal result. If you would like to tune other ZeRO optimization configurations, set `"fast"` to `false` in the [autotuning configuration](#autotuning-configuration).
## Autotuning Workflow and Scope
Currently, the DeepSpeed Autotuner tunes ZeRO stages, micro-batch size per GPU, and ZeRO configurations (offloading is not yet supported) on top of other configurations such as optimizer, scheduler, fp16 defined by the user in the DeepSpeed configuration file. A high-level workflow is described below:
1. At the beginning of the autotuning process, the Autotuner launches a model information profiling experiment to get the number of model parameters and amount of activation memory.
2. Then the Autotuner explores ZeRO stages in the order `[0, 1, 2, 3]`. For each ZeRO stage, the Autotuner estimates the minimal memory required per GPU to train the model and compares it with the available GPU memory. If the estimate is less than the available memory, the model might be runnable with the given ZeRO stage, and the Autotuner then tunes the micro-batch size per GPU and other ZeRO configurations for that ZeRO stage.
1. The Autotuner first tunes the micro-batch size per GPU along with gradient accumulation steps (users can specify the maximum global train batch size for the model), and selects a list of micro-batch sizes to explore next.
2. Each ZeRO stage has a carefully-chosen default tuning space to explore for the other [ZeRO configurations](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training). Users can overwrite it through the DeepSpeed configuration file.
3. Combinations of different micro-batch sizes and ZeRO configurations are then explored as experiments by the Autotuner using a supported algorithm (e.g., xgboost model-based algorithm). Early termination in this exploration is set by heuristics and is configurable by the user.
4. An optimal configuration based on a metric (throughput, latency, or FLOPS) is returned for that ZeRO stage.
3. The exploration of ZeRO stages stops if the optimal setup for the current ZeRO stage is no better than that of the previously tuned ZeRO stage (other heuristics are also used to determine termination).
4. In the end, the global optimal setup is returned to the user. If the value of the `--autotuning` flag is set to `run`, the Autotuner launches the training with the found optimal setup.
Note that ZeRO stages, micro-batch sizes, and other ZeRO configurations to tune are also configurable and can be overwritten by the user through the DeepSpeed configuration file. See [Configuring Tuning Scope](#configuring-tuning-scope) for details.
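A schematic sketch of this loop is shown below. This is not the actual Autotuner implementation; the memory estimates and metric values are made-up placeholders used only to illustrate the control flow described above.
```python
# Schematic sketch of the autotuning workflow above (not the real implementation).
AVAILABLE_GPU_MEM_GB = 60          # illustrative per-GPU memory budget

def estimate_min_gpu_mem_gb(stage):
    # Placeholder standing in for the Autotuner's per-stage memory estimate.
    return {0: 64, 1: 48, 2: 32, 3: 16}[stage]

def tune_stage(stage):
    # Placeholder standing in for micro-batch-size + ZeRO-config tuning;
    # returns the best metric value (e.g. throughput) found for this stage.
    return {0: 0.0, 1: 187.3, 2: 148.2, 3: 120.0}[stage]

best_metric, best_stage = 0.0, None
for stage in [0, 1, 2, 3]:                                # step 2: explore stages in order
    if estimate_min_gpu_mem_gb(stage) > AVAILABLE_GPU_MEM_GB:
        continue                                          # model unlikely to fit at this stage
    metric = tune_stage(stage)                            # steps 2.1 - 2.4
    if metric <= best_metric:
        break                                             # step 3: no improvement over previous stage
    best_metric, best_stage = metric, stage

print(f"optimal setup: ZeRO stage {best_stage} with metric {best_metric}")  # step 4
```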
## Configuring Tuning Scope
The DeepSpeed Autotuner tunes ZeRO stages, micro-batch size per GPU, and ZeRO configurations. Other DeepSpeed configurations are used as defined by the user in the DeepSpeed configuration file. Users can overwrite any of the tuning parameters.
### Configuring ZeRO Stage
By default, the DeepSpeed Autotuner tunes ZeRO stages. If `"zero_optimization"` is not defined or set to `"all"`, the Autotuner explores ZeRO stages in the order of `[0, 1, 2, 3]`. Users can overwrite this behavior if they already know what ZeRO stage(s) to use. For example, the below section in the DeepSpeed configuration file limits the Autotuner to only exploring ZeRO stage 2 and 3.
```json
{
"zero_optimization": {
"stage": [2, 3]
}
}
```
### Configuring Train Micro-Batch Size
The DeepSpeed Autotuner tunes the micro-batch size per GPU (`train_micro_batch_size_per_gpu` in DeepSpeed configuration) along with gradient accumulation steps (`gradient_accumulation_steps` in DeepSpeed configuration). The `train_micro_batch_size_per_gpu` value specified by the user in the DeepSpeed configuration file is used as the minimal micro-batch size per GPU to tune if it's runnable.
When using Hugging Face and `train_micro_batch_size_per_gpu` is set to ["auto"](#using-autotuning-with-hugging-face), if `train_micro_batch_size_per_gpu` has a corresponding training script mapping provided in `arg_mappings`, the command-line value is used as the minimal micro-batch size per GPU to tune; otherwise, `1` is used as the minimal micro-batch size per GPU in tuning.
`train_batch_size` in the DeepSpeed configuration must be equal to `train_micro_batch_size_per_gpu * gradient_accumulation_steps * total_num_gpus // model_parallelism_size`. Currently, the DeepSpeed Autotuner ignores the `train_batch_size` parameter specified in the DeepSpeed configuration file; please use `train_micro_batch_size_per_gpu` and `gradient_accumulation_steps` in autotuning.
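As a quick illustration of this relation (the numbers below are made up):
```python
# Illustrative check of the batch-size relation above (made-up values).
train_micro_batch_size_per_gpu = 4
gradient_accumulation_steps = 2
total_num_gpus = 16
model_parallelism_size = 1
train_batch_size = (train_micro_batch_size_per_gpu * gradient_accumulation_steps *
                    total_num_gpus // model_parallelism_size)
print(train_batch_size)  # 128
```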
The configuration below asks the Autotuner to use `4` as the minimal micro-batch size per GPU in tuning. Note that the value passed to the training script through `--per_device_train_batch_size` (which is supposed to equal the `train_micro_batch_size_per_gpu` value set in the DeepSpeed configuration) is ignored.
```json
{
"train_micro_batch_size_per_gpu": 4,
"autotuning": {
"enabled": true,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size"
}
}
}
```
The configuration below asks the Autotuner to use the value of `"--per_device_train_batch_size"` in the training script as the minimal micro-batch size per GPU in tuning. The training script also takes `--gradient_accumulation_steps` as an argument.
```json
{
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"autotuning": {
"enabled": true,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
"gradient_accumulation_steps": "--gradient_accumulation_steps"
}
}
}
```
Users can set the maximum train batch size (global effective batch size) for the autotuning process by specifying `max_train_batch_size` in the autotuning configuration section of the DeepSpeed configuration file, as shown below. If `max_train_batch_size` is not defined, the Autotuner uses `maximum_train_micro_batch_size_per_gpu_runnable * gradient_accumulation_steps * total_num_gpus // model_parallelism_size` as `max_train_batch_size` (here, `gradient_accumulation_steps` is taken from the DeepSpeed configuration file or the training script, defaulting to `1`).
```json
{
"autotuning": {
"enabled": true,
"max_train_batch_size": 1024,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size"
}
}
}
```
By default, the DeepSpeed Autotuner selects at most `num_tuning_micro_batch_sizes` (micro-batch size per GPU, gradient accumulation steps) pairs for tuning ZeRO configurations. `num_tuning_micro_batch_sizes` defaults to `3` and can be set in the [autotuning configuration](#autotuning-configuration).
Users can specify the list of micro-batch sizes to tune in the DeepSpeed configuration file.
For example, the following section in the DeepSpeed configuration file limits the autotuning to explore `train_micro_batch_size_per_gpu` in `[1, 4, 16]`, and `gradient_accumulation_steps = 2` is used. Combinations of the two parameters are considered in the tuning (constrained by `max_train_batch_size` if defined). Note that specifying a list of `gradient_accumulation_steps` to tune is not supported.
```json
{
"train_micro_batch_size_per_gpu": [1, 4, 16],
"gradient_accumulation_steps": 2
}
```
The entry below asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (the micro-batch size per GPU is fixed at 4). Note that it's different from using `"train_micro_batch_size_per_gpu": [4]`, which asks the Autotuner to tune the micro-batch size per GPU starting from `4`.
```json
{
"train_micro_batch_size_per_gpu": 4
}
```
#### Learning rate scaling when the effective batch size changes
Given that the DeepSpeed Autotuner gives users the flexibility to explore the best-performing configuration under a range of batch sizes (e.g., by changing `train_micro_batch_size_per_gpu`), the total effective batch size `B'` per training iteration that maximizes compute efficiency may differ from the batch size `B` the user originally trained with. If the user decides to adopt the best-performing batch size `B'` identified by the Autotuner to achieve faster training, we suggest scaling the learning rate by `sqrt(B'/B)` while keeping the other hyperparameters unchanged. The rationale behind this scaling is to keep the variance in the gradient expectation constant when the batch size changes. In the case of stochastic gradient descent, we recommend scaling the learning rate linearly by `B'/B` while keeping the other hyperparameters (momentum, weight decay, etc.) the same, which we empirically find to work better for SGD and momentum-based optimizers.
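For example, a minimal sketch of the suggested rescaling (the batch sizes and base learning rate below are hypothetical):
```python
# Hypothetical example of the learning-rate rescaling suggested above.
import math

B = 256          # original effective batch size
B_prime = 1024   # best-performing batch size found by the Autotuner
lr = 6e-4        # original learning rate

lr_general = lr * math.sqrt(B_prime / B)  # general suggestion above
lr_sgd = lr * (B_prime / B)               # suggestion for SGD / momentum-based optimizers
print(lr_general, lr_sgd)                 # 0.0012 0.0024
```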
### Configuring ZeRO configurations
The DeepSpeed Autotuner explores a set of carefully-chosen default values for ZeRO configuration parameters, defined in [`DEFAULT_TUNING_SPACE_ZERO_0,1,2,3`](constants.py). Users can overwrite any of the parameters (using a value or a list of values) in the DeepSpeed configuration file.
For example, the default tuning space for ZeRO stage 1 is
```python
DEFAULT_TUNING_SPACE_ZERO_1 = {
"zero_optimization": {
"stage": 1,
"reduce_bucket_size": [5e7,
5e8,
1e9],
"allgather_bucket_size": [5e7,
5e8,
1e9],
}
}
```
Here, `3*3 = 9` combinations of different `reduce_bucket_size` and `allgather_bucket_size` values are explored in the tuning. Users can overwrite this in the DeepSpeed configuration file by:
```json
{
"zero_optimization": {
"stage": 1,
"reduce_bucket_size": [5e7, 5e8],
"allgather_bucket_size": 5e8
}
}
```
With this overwrite, only `2*1 = 2` cases, `{"reduce_bucket_size": 5e7, "allgather_bucket_size": 5e8}` and `{"reduce_bucket_size": 5e8, "allgather_bucket_size": 5e8}`, would be explored in the tuning.
If `"stage"` is not defined or set as `"all"`, then the overwriting applies to all ZeRO stages.
#### Offloading and NVME
Currently, the DeepSpeed Autotuner does not tune offloading behaviors but instead uses the values defined in the offload section of the DeepSpeed configuration file. See [Parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) and [Optimizer offloading](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) for details.
If using NVME for offloading, users can run a benchmark offline to select the optimal `aio` setup in DeepSpeed. Refer to [profiling NVMe and configuring aio param section](https://github.com/microsoft/DeepSpeed/issues/998).
## Autotuning Output
By default, the DeepSpeed Autotuner generates a folder named `"autotuning_exps"` to store the descriptions of the autotuning experiments, and a folder named `"autotuning_results"` to store the results of the autotuning experiments, under the training script launching path. Users can specify other paths to use by setting `"results_dir"` or `"exps_dir"` in the autotuning configuration ([Results and Experiments Path](#results-and-experiments-path)).
Each autotuning experiment has a unique name based on the tuning parameters used. For example, `z1_tmbspg3_gas1` means the experiment uses ZeRO stage 1, a train micro-batch size per GPU (tmbspg) of 3, and gradient accumulation steps (gas) of 1. The experiment description is stored as the file `z1_tmbspg3_gas1.json` in the `"exps_dir"` folder, and the experiment result is stored in a folder named `z1_tmbspg3_gas1` under the `"results_dir"`.
Each experiment result folder could contain the following files:
```bash
z1_tmbspg3_gas1/ # z1_tmbspg3_gas1 experiment result folder
|-- cmd.txt # command used to launch the experiment
|-- ds_config.json # DeepSpeed configuration used in the experiment
|-- exp.json # experiment description, used by the Autotuner for experiment management
|-- metrics.json # performance metrics recorded for the experiment
|-- stderr.log # stderr of running the experiment
`-- stdout.log # stdout of running the experiment
```
After the autotuning is done, a summary table of the tuning experiments and the autotuning duration are printed to the terminal, for example:
```
| tuning_space | num_exps | best_metric_val | best_exp_name |
| :----------- | -------: | --------------: | :-------------- |
| z0 | 2 | 90.1269 | z0_tmbspg2_gas1 |
| z1 | 2 | 187.298 | z1_tmbspg3_gas1 |
| z2 | 2 | 148.154 | z2_tmbspg3_gas1 |
| global | 6 | 187.298 | z1_tmbspg3_gas1 |
Tuning completed in 0:00:03.602291
```
A file named `summary.txt` with the same content is saved under the `"results_dir"` for reference as well.
Besides the tuning summary, the optimal DeepSpeed configuration found by autotuning (`ds_config_optimal.json`) and the corresponding command to launch the experiment (`cmd_optimal.txt`) are also saved under the `"results_dir"` after autotuning finishes.
## Autotuning Configuration
While `"autotuning": {"enabled": true}` is the minimal requirement to enable autotuning, there are other parameters users can define to configure the autotuning process. The major parameters and their default values are shown below. These parameters can be set in the `"autotuning"` section of the DeepSpeed configuration file.
```json
{
"autotuning": {
"enabled": false,
"results_dir": null,
"exps_dir": null,
"overwrite": false,
"metric": "throughput",
"start_profile_step": 3,
"end_profile_step": 5,
"fast": true,
"max_train_batch_size": null,
"mp_size": 1,
"num_tuning_micro_batch_sizes": 3,
"tuner_type": "model_based",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"arg_mappings": null
}
}
```
### Results and Experiments Path
`"results_dir"` points to a folder where the results of all the autotuning experiments are stored. `"exps_dir"` points to a folder where the descriptions of the autotuning experiments are stored.
By default, `"exps_dir"` is set to a folder named `"autotuning_exps"` and `"results_dir"` is set to a folder named `"autotuning_results"` under the training script launching path. Users can specify other paths to use by setting these two parameters in the autotuning configuration.
By default, the Autotuner does not run experiments whose results already exist. To change this behavior and rerun experiments all the time, set `"overwrite"` to true.
### Autotuning Metric
The Autotuner ranks tuning experiments by a metric. Currently, three metric types are supported, namely `"latency"`, `"throughput"`, and `"FLOPS"`:
* "throughput": training samples per second (calculated as `train_batch_size * 1000 / "latency"`)
* "latency": training step latency in ms (`training iteration latency * gradient accumulation steps`)
* "FLOPS": floating-point operations per second achieved per GPU (calculated as `the number of flops / training iteration latency`). Refer to [DeepSpeed flops profiler](https://www.deepspeed.ai/tutorials/flops-profiler/) for details on how the number of flops is measured.
By default, `"throughput"` is used for ranking. Users can select other metrics, e.g., setting `{"metric": "latency"}` would use latency as the ranking metric.
Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training.
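As a small, hypothetical illustration of the throughput definition above:
```python
# Hypothetical numbers illustrating the throughput metric defined above.
train_batch_size = 256
latency_ms = 1200.0                                # training step latency in milliseconds
throughput = train_batch_size * 1000 / latency_ms  # training samples per second
print(round(throughput, 1))                        # 213.3
```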
### Autotuning Resources
The DeepSpeed Autotuner uses all the hardware resources in the environment to run the tuning experiments. Experiments can be scheduled and run in parallel if resources are available and the tuning logic allows parallelization (some steps in the tuning workflow are sequential).
For example, in an environment with 2 nodes and 16 GPUs per node, if the user specifies `--num_gpus=16` and `--num_nodes=1` in the training script, then at most two autotuning experiments can be run in parallel at a time.
### Profile Steps
In each of the tuning experiments, profiling is performed over a continuous portion of training steps to collect performance metrics, which are then used to rank the tuning experiments. Users can specify when to start and end the profiling.
* `start_profile_step` (int, defaults to `3`): the training step at which to start recording performance metrics
* `end_profile_step` (int, >= `start_profile_step`, defaults to `5`): the training step at which to end recording performance metrics
Note that setting `start_profile_step` to a large value could result in a noticeably longer run time for each tuning experiment.
### Fast Mode
Besides ZeRO stages and micro-batch sizes per GPU (`fast` mode), users can tune other ZeRO optimization configurations by setting `"fast"` to `false`. The autotuning time increases as the tuning space gets larger and more tuning experiments are performed. Fast mode is enabled by default.
### Max Train Batch Size
Users can set the maximum train batch size (global effective batch size) for the autotuning process by specifying `max_train_batch_size` in the autotuning configuration section of the DeepSpeed configuration file. If `max_train_batch_size` is not defined, the Autotuner uses `maximum_train_micro_batch_size_per_gpu_runnable * gradient_accumulation_steps * total_num_gpus // model_parallelism_size` as `max_train_batch_size` (here, `gradient_accumulation_steps` is taken from the DeepSpeed configuration file or the training script, defaulting to `1`). See [Configuring Train Micro-Batch Size](#configuring-train-micro-batch-size) for how it is used together with the micro-batch size and gradient accumulation steps.
### Model Parallelism Size
If model parallelism is used, set the `mp_size` in the autotuning configuration to be the model parallelism degree. `mp_size` defaults to 1 which means no model parallelism is used.
### Tuning algorithms
Within a ZeRO stage, combinations of micro-batch sizes and other ZeRO configurations form a tuning space of experiments, which the DeepSpeed Autotuner explores in an order determined by the tuner algorithm.
Currently, three types of tuner algorithms are supported:
* `"random"`: randomly select the next set of configurations to experiment with.
* `"gridsearch"`: sequentially select the next set of configurations to experiment with.
* `"model_based"`: an XGBoost cost model is used to select the next set of configurations to experiment with, given the results of the finished experiments.
By default, the `"model_based"` algorithm is used.
The Autotuner stops exploring the space when any of the following conditions is met (a sketch of this rule follows the list):
* When no more promising configurations are likely to be found. `"tuner_early_stopping"` defines the number of experiments to explore beyond the current best experiment; if no better experiment is found within that number, the Autotuner stops the exploration. `"tuner_early_stopping"` defaults to `5`.
* When the total number of experiments explored exceeds `"tuner_num_trials"`, which defaults to `50`.
* When all the experiments in the tuning space are explored.
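A minimal sketch of this stopping rule (hypothetical variable names, not the actual tuner code):
```python
# Hypothetical sketch of the stopping rule described above.
def should_stop(exps_since_best, total_exps, space_exhausted,
                tuner_early_stopping=5, tuner_num_trials=50):
    return (exps_since_best >= tuner_early_stopping   # no better experiment found recently
            or total_exps > tuner_num_trials          # trial budget exceeded
            or space_exhausted)                       # nothing left to explore

print(should_stop(exps_since_best=5, total_exps=12, space_exhausted=False))  # True
```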
## Using Autotuning with Hugging Face
Hugging Face users can set some configuration values to ["auto"](https://huggingface.co/transformers/main_classes/deepspeed.html?highlight=gradient_accumulation_steps#shared-configuration).
`"auto"` means the value will be set to the default in Hugging Face or be overwritten using the supplied values from the command line arguments.
In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has "auto" keywords, they are treated as the value "auto".
## GPT2-large Example
This section shows an example of using DeepSpeed autotuning. For more examples, refer to [autotuning](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo.
Example training script:
```bash
MODEL_NAME=gpt2-large
PER_DEVICE_TRAIN_BATCH_SIZE=1
HF_PATH=~/projects # REPLACE WITH YOUR HUGGING FACE PATH
DS_CONFIG_PATH=ds_config.json # REPLACE WITH YOUR DEEPSPEED CONFIGURATION FILE PATH
NEPOCHS=1
NGPUS=16
NNODES=1
OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}
deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed $DS_CONFIG_PATH \
--model_name_or_path $MODEL_NAME \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--do_train \
--do_eval \
--fp16 \
--per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
--learning_rate 2e-5 \
--num_train_epochs $NEPOCHS \
--output_dir ${OUTPUT_DIR} \
--overwrite_output_dir
```
Example DeepSpeed configuration file:
```json
{
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"autotuning": {
"enabled": true,
"arg_mappings": {
"train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
"gradient_accumulation_steps": "--gradient_accumulation_steps"
},
}
}
```
Example output (in `summary.txt`):
```
| tuning_space | num_experiments | best_metric_val | best_exp_name |
| :----------- | --------------: | --------------: | :-------------- |
| z0 | 4 | 59.0229 | z0_gas1_tmbspg2 |
| z1 | 5 | 87.3017 | z1_gas1_tmbspg3 |
| z2 | 3 | 77.8338 | z2_gas1_tmbspg3 |
| z3 | 1 | 0 | z3_gas1_tmbspg3 |
| global | 13 | 87.3017 | z1_gas1_tmbspg3 |
Tuning completed in 0:27:33.988447. Total number of experiments: 13.
```
The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value are also shown in parentheses. Assume the strategy users would use in the hand-tuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory.
- `baseline` is the vanilla Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned.
- `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values.
- `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning.
Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg).
| Model name | baseline (vanilla HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) |
| ---------- | -------------------- | ------------------------ | ------------------------------ |
| GPT2-large | 27.874 (mbs = 1) | 56.797 (z = 1, mbs = 2) | 69.061 (z = 1, mbs = 3) |
As we can see, the DeepSpeed Autotuner can select a configuration better than the hand-tuned one with a reasonable number of experiments. The examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) demonstrate the effectiveness of autotuning across different models.
from .autotuner import Autotuner
import copy
import json
import os
from random import sample
import shutil
import subprocess
import hjson
import torch
import time
import datetime
import math
from ..runtime.config_utils import dict_raise_error_on_duplicate_keys
from ..runtime.constants import *
from ..runtime.zero.constants import *
from ..utils import logger
from .config import DeepSpeedAutotuningConfig
from .constants import *
from .scheduler import ResourceManager, run_experiment
from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner
from .utils import *
try:
from tabulate import tabulate
except ImportError:
tabulate = None
class Autotuner:
"""The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources users spend on tuning, but can also discover configurations better than hand-tuned methods.
Autotuning with DeepSpeed requires no code change from DeepSpeed users. Please refer to the README for usage details.
"""
def __init__(self, args, active_resources):
self.args = args
self.selected_exp_dir = None
assert tabulate is not None, "Missing required package `tabulate`, please install with `pip install deepspeed[autotuning]`."
logger.debug(f"autotuning args={args}")
self.user_config = self._get_user_config(args.user_args)
assert self.user_config is not None, "DeepSpeed configuration is not provided"
self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config)
self.exps_dir = DEFAULT_EXPRS_DIR
if self.autotuning_config.exps_dir and self.autotuning_config.exps_dir != "":
self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.overwrite and os.path.exists(self.exps_dir):
shutil.rmtree(self.exps_dir, ignore_errors=True)
if not os.path.exists(self.exps_dir):
os.makedirs(self.exps_dir, exist_ok=True)
self.results_dir = DEFAULT_RESULTS_DIR
if self.autotuning_config.results_dir and self.autotuning_config.results_dir != "":
self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.overwrite and os.path.exists(self.results_dir):
shutil.rmtree(self.results_dir, ignore_errors=True)
if not os.path.exists(self.results_dir):
os.makedirs(self.results_dir, exist_ok=True)
# set the active resource for the autotuner resource manager
self.rm = self._get_resource_manager(active_resources)
# get resource requirement for each autotuning experiment
self.exp_num_nodes, self.exp_num_gpus = self._get_exp_resources(args)
assert self.exp_num_gpus <= self.rm.num_gpus_per_node, "The --num_gpus value requested for autotuning experiments must not exceed the number of GPUs available per node"
assert self.exp_num_nodes <= len(
self.rm.nodes), "The --num_nodes value requested for autotuning experiments must not exceed the number of available nodes"
self.records = {}
def print_tuning_results(self):
"""Print the autotuning results in tabular format.
"""
best_space_records = self.get_best_space_records()
tab = []
if best_space_records:
for key, val in best_space_records.items():
if not val:
continue
row = []
row.append(key)
num_exps = 0
if key == GLOBAL_TUNING_SPACE:
cnt = 0
for k, v in best_space_records.items():
if k != GLOBAL_TUNING_SPACE:
cnt += v[2]
num_exps = cnt
else:
num_exps = val[2]
row.append(num_exps)
row.append(val[1])
row.append(val[0]['name'])
tab.append(row)
summary = tabulate(tab,
headers=[
"tuning_space",
"num_experiments",
"best_metric_val",
"best_exp_name"
],
tablefmt="pipe")
print(summary)
with open(os.path.join(self.results_dir,
'summary.txt'),
'w',
buffering=BUFSIZE) as fd:
fd.write(summary)
fd.flush()
os.fsync(fd)
if GLOBAL_TUNING_SPACE in best_space_records:
best_exp, best_metric_val, total_num_exps = best_space_records[GLOBAL_TUNING_SPACE]
if best_exp:
logger.info(
f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}."
)
else:
logger.info(
f"No optimal setup is found. Please check that experiments were run successfully."
)
tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time))
logger.info(f"Tuning completed in {tuning_duration}")
with open(os.path.join(self.results_dir, 'summary.txt'), 'a') as f:
f.write(
f"\n\nTuning completed in {tuning_duration}. Total number of experiments: {self.rm.experiment_count - 1}."
)
f.flush()
def _get_user_config(self, user_args):
"""Get DeepSpeed configuration from the user arguments passed to the launcher.
Args:
user_args ([list]): user arguments passed to the DeepSpeed launcher
Returns:
[dict]: DeepSpeed configuration dictionary
"""
user_config_file = None
if "--deepspeed_config" in user_args:
idx = user_args.index("--deepspeed_config")
assert ".json" in user_args[idx +
1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration"
user_config_file = user_args[idx + 1]
elif "--deepspeed" in user_args:
idx = user_args.index("--deepspeed")
if ".json" in user_args[idx + 1]:
user_config_file = user_args[idx + 1]
logger.debug(f"user_config_file = {user_config_file}")
if user_config_file is not None:
assert os.path.isfile(
user_config_file
), "DeepSpeed configuration file: {} is not an existing file".format(
user_config_file
)
if os.path.exists(user_config_file):
return json.load(open(user_config_file,
"r"),
object_pairs_hook=dict_raise_error_on_duplicate_keys)
return None
def _get_resource_manager(self, active_resources):
"""Initialize and return a resource manager
Args:
active_resources ([dict]): A dictionary of hostname and its slots (GPUs), e.g. {"worker-0": "0,1,2,3,4,5,6,7,8"}
Raises:
RuntimeError: raises the error if no GPU is available
Returns:
[ResourceManager]: A resource manager that schedules and runs autotuning experiments.
"""
logger.info(f"active_resources = {active_resources}")
hosts = []
ngpus_per_node = 100
for hostname, slots in active_resources.items():
hosts.append(hostname)
ngpus_per_node = min(len(slots), ngpus_per_node)
assert ngpus_per_node > 0, "no gpu is available"
return ResourceManager(args=self.args,
hosts=hosts,
num_gpus_per_node=ngpus_per_node,
results_dir=self.results_dir,
exps_dir=self.exps_dir,
arg_mappings=self.autotuning_config.arg_mappings)
def _get_exp_resources(self, args):
"""Get resource requirement for each autotuning experiment
Args:
args (dict): user args
Returns:
num_nodes, num_gpus: the number of gpus and number of nodes used in the autotuning experiments
"""
if args.num_nodes > 0:
num_nodes = args.num_nodes
else:
num_nodes = len(self.rm.nodes)
if args.num_gpus > 0:
num_gpus = args.num_gpus
else:
num_gpus = self.rm.num_gpus_per_node
return num_nodes, num_gpus
def metric(self):
return self.autotuning_config.metric
def fast_enabled(self):
return self.autotuning_config.fast
def max_train_batch_size(self):
return self.autotuning_config.max_train_batch_size
def mp_size(self):
return self.autotuning_config.mp_size
def max_train_micro_batch_size_per_gpu(self):
if self.max_train_batch_size() and self.max_train_batch_size(
) > 0: # if the user specifies a max_train_batch_size
max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size(
) // (self.exp_num_gpus * self.exp_num_nodes
) # gradient accumulation steps >=1
return min(self.autotuning_config.max_train_micro_batch_size_per_gpu,
max_train_micro_batch_size)
else:
return self.autotuning_config.max_train_micro_batch_size_per_gpu
def min_train_micro_batch_size_per_gpu(self):
return self.autotuning_config.min_train_micro_batch_size_per_gpu
def num_tuning_micro_batch_sizes(self):
return self.autotuning_config.num_tuning_micro_batch_sizes
def fp16_enabled(self):
if FP16 in self.user_config.keys():
return self.user_config[FP16].get(FP16_ENABLED, FP16_ENABLED_DEFAULT)
else:
return False
def get_gpu_memory_info(self):
return torch.cuda.get_device_properties(0).total_memory
def get_activation_memory_per_gpu(self):
if self.model_info and "activation_mem_per_gpu" in self.model_info:
return self.model_info["activation_mem_per_gpu"]
def get_instantiation_memory_required_per_gpu(self, zero_stage):
num_params = self.get_model_num_params()
total_gpus = self.exp_num_nodes * self.exp_num_gpus
fp16_enabled = self.fp16_enabled()
if not num_params:
return 0
# assume the model uses Adam optimizer
# ZERO_OPTIMIZATION_DISABLED:
params_mem = num_params * (2 if fp16_enabled else 4)
gradients_mem = num_params * (2 if fp16_enabled else 4)
optimizer_mem = num_params * (16 if fp16_enabled else 8)
if zero_stage >= ZERO_OPTIMIZATION_OPTIMIZER_STATES:
optimizer_mem = optimizer_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_GRADIENTS:
gradients_mem = gradients_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_WEIGHTS:
params_mem = params_mem / total_gpus
mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size()
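# Worked example (a rough sketch, assuming the Adam byte counts above hold):
# a 1e9-parameter fp16 model on 8 GPUs with mp_size() == 1 needs roughly
#   params_mem    = 1e9 * 2  = 2 GB
#   gradients_mem = 1e9 * 2  = 2 GB
#   optimizer_mem = 1e9 * 16 = 16 GB (fp32 master weights + Adam states)
# ZeRO stage 1 shards only the optimizer states: 16 GB / 8 = 2 GB,
# so mem_per_gpu comes to about 2 + 2 + 2 = 6 GB.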
return mem_per_gpu
def _generate_experiments(self, tuning_space, max_train_batch_size_per_gpu):
"""Generates a list of autotuning experiments given a tuning_space.
The corresponding parameter values are replaced by user-defined values in the DeepSpeed configuration file.
Args:
tuning_space ([dict]): A DeepSpeed configuration dictionary where a value can be a list (called a tuning parameter). For example,
{
"zero_optimization": {
"stage": 1,
"reduce_bucket_size": [5e7,
5e8,
1e9],
"allgather_bucket_size": [5e7,
5e8,
1e9],
}
}
reduce_bucket_size and allgather_bucket_size are the tuning parameters in this tuning space.
Returns:
[list]: a list of experiments generated by taking combinations of values of the tuning space. The above tuning space generates 3*3 = 9 experiments if the user DeepSpeed configuration file does not overwrite the two tuning parameters or define more tuning parameters.
"""
exps = []
# each zero stage uses a different template configuration file
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
template_config = {}
if stage == 0:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_0
template_config = hjson.load(open(template_path, 'r'))
prefix = "z0_"
elif stage == 1:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_1
template_config = hjson.load(open(template_path, 'r'))
prefix = "z1_"
elif stage == 2:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_2
template_config = hjson.load(open(template_path, 'r'))
prefix = "z2_"
elif stage == 3:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_3
template_config = hjson.load(open(template_path, 'r'))
model_info = self.model_info
if model_info and "hidden_size" in model_info:
hs = model_info["hidden_size"]
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE] = hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD] = 10 * hs
prefix = "z3_"
else:
return exps
# replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file
replace_dict(tuning_space,
self.user_config,
[ZERO_OPTIMIZATION,
TRAIN_MICRO_BATCH_SIZE_PER_GPU])
logger.debug(f"tuning_space = {json.dumps(tuning_space)}")
all_configs = get_all_configs(tuning_space, ignore_keys=["optimizer"])
tuning_keys = get_tuning_keys(tuning_space)
logger.debug(f"tuning_keys = {tuning_keys}")
logger.debug(f"before prunning total configs = {len(all_configs)}")
pruned_list = prune_configs(all_configs)
logger.debug(f"after prunning total configs = {len(pruned_list)}")
for config in pruned_list:
exp_config = copy.deepcopy(template_config)
# fill the template with the expr config
replace_dict(exp_config, config)
# if the config does not use offloading, remove the offloading section
config_zero = config.get(ZERO_OPTIMIZATION, None)
if config_zero:
if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in exp_config[
ZERO_OPTIMIZATION]:
del exp_config[ZERO_OPTIMIZATION][OFFLOAD_OPTIMIZER]
if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[
ZERO_OPTIMIZATION]:
del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM]
# set gradient accumulation steps according to max_train_batch_size_per_gpu
mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
gas = max_train_batch_size_per_gpu // mbs
exp_config[GRADIENT_ACCUMULATION_STEPS] = gas
exp_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
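# e.g. (illustrative): mbs = 4 with max_train_batch_size_per_gpu = 16 gives gas = 4, so with
# 8 GPUs on 1 node and mp_size() == 1, TRAIN_BATCH_SIZE = 4 * 4 * 8 = 128.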
exp = {}
# generate the expr name
exp_name = canonical_name(exp_config, tuning_keys, prefix)
exp['name'] = exp_name
exp[DS_CONFIG] = exp_config
exp['num_gpus'] = self.exp_num_gpus
exp['num_nodes'] = self.exp_num_nodes
exps.append(exp)
return exps
def tune(self):
""" Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records.
"""
self.start_time = time.time()
if self.fast_enabled():
logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
# model info profile run with DEFAULT_MIN_MEM_CONFIG
model_info = self.model_info_profile_run()
if model_info:
self.model_info = model_info
else:
return
logger.info(
f"The model has {number_to_string(self.get_model_num_params())} parameters.")
self.gpu_mem = self.get_gpu_memory_info()
logger.info(
f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}."
)
self.activation_mem = self.get_activation_memory_per_gpu()
logger.info(
f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1."
)
stage = self.user_config.get(ZERO_OPTIMIZATION,
{}).get(ZERO_OPTIMIZATION_STAGE,
"all")
user_zero_stages = [stage] if not isinstance(stage, list) else stage
logger.info(f"User-defined zero stages are {stage}.")
mbs = 0
max_mbs = 0
metric_val = 0
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_DISABLED) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_DISABLED in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space"
)
next_max_mbs, next_mbs, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_0)
if next_mbs > mbs:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_DISABLED} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_OPTIMIZER_STATES) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_OPTIMIZER_STATES in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space"
)
next_max_mbs, next_mbs, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_1, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
if next_mbs > mbs:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_OPTIMIZER_STATES} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_GRADIENTS) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_GRADIENTS in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space"
)
next_max_mbs, next_mbs, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_2, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
if next_mbs > mbs:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_GRADIENTS} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_WEIGHTS) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_WEIGHTS in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space"
)
_, _, _ = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
else:
logger.info(
f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZERO_OPTIMIZATION_WEIGHTS} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed."
)
return
def tune_space(self,
tuning_space,
prev_max_mbs=0,
prev_best_mbs=0,
prev_best_metric_val=0):
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT)
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
tuning_micro_batch_sizes = []
max_train_batch_size_per_gpu = 0
tuning_micro_batch_sizes_overwritten = False
# calculate max micro batch size using gpu memory, model instantiation memory and activation memory
# calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
calculated_max_micro_batch_size = int(
self.gpu_mem -
self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem
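# e.g. (illustrative numbers): 32e9 B of GPU memory, 6e9 B of instantiation memory
# and 2e9 B of activation memory per micro batch of 1 give (32e9 - 6e9) // 2e9 = 13.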
logger.info(
f"Start tuning for space {tuning_space_name}, calculated_max_micro_batch_size = {calculated_max_micro_batch_size}"
)
if calculated_max_micro_batch_size < prev_max_mbs:
logger.info(
f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}"
)
return 0, 0, 0
if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance(
self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU],
list):
# user-specified micro batch size per gpu is a list which overwrites the default tuning behavior
tuning_micro_batch_sizes = [
s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
if isinstance(s,
int)
]
gas = self.get_gas_from_user_config()
min_micro_batch_size = min(tuning_micro_batch_sizes)
max_micro_batch_size = max(tuning_micro_batch_sizes)
max_train_batch_size_per_gpu = max_micro_batch_size * gas
tuning_micro_batch_sizes_overwritten = True
else:
# auto-detects the list of micro batch sizes to tune
min_micro_batch_size, max_micro_batch_size = self.get_min_max_micro_batch_size(
stage, prev_max_mbs, calculated_max_micro_batch_size)
if max_micro_batch_size < prev_max_mbs:
logger.info(
f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}"
)
return 0, 0, 0
tuning_micro_batch_sizes, max_train_batch_size_per_gpu = self.get_tuning_micro_batch_size_list(
min_micro_batch_size,
max_micro_batch_size,
num_tuning_micro_batch_sizes=self.num_tuning_micro_batch_sizes())
logger.info(
f"tuning_micro_batch_sizes = {tuning_micro_batch_sizes}, max_train_batch_size_per_gpu = {max_train_batch_size_per_gpu}"
)
# return if the tuning_micro_batch_sizes list is empty
if not tuning_micro_batch_sizes:
logger.info(f"End tuning for space {tuning_space_name}")
return 0, 0, 0
# tune micro batch sizes and gradient accumulation steps given max_train_batch_size_per_gpu
tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes(
tuning_micro_batch_sizes,
max_train_batch_size_per_gpu,
min_micro_batch_size,
stage,
tuning_micro_batch_sizes_overwritten)
fast_best_record = self.get_best_space_record(tuning_space_name)
fast_best_metric_val = fast_best_record[1] if fast_best_record else 0
fast_best_mbs = fast_best_record[0][DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0
logger.info(
f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}")
if self.fast_enabled() or stage == 0:
logger.info(f"End tuning for space: {tuning_space_name}")
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val
# if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corresponding value in the previous Zero stage, return, do not tune other Zero configuration parameters
if stage > 0:
if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val:
logger.info(
f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters."
)
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val
tuning_space[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = tuning_micro_batch_sizes
tuning_space_name = canonical_name(tuning_space,
tuning_keys=get_tuning_keys(tuning_space),
prefix="z" + str(stage) + "_",
omit_val=True)
logger.info(f'Tuning space is {tuning_space}')
logger.info(f'Tuning space name is {tuning_space_name}')
exps = self._generate_experiments(tuning_space, max_train_batch_size_per_gpu)
logger.info(f'Tuner type is {self.autotuning_config.tuner_type}')
if self.autotuning_config.tuner_type == AUTOTUNING_TUNER_MODELBASED:
t = ModelBasedTuner(exps, self.rm, self.metric(), tuning_space)
elif self.autotuning_config.tuner_type == AUTOTUNING_TUNER_RANDOM:
t = RandomTuner(exps, self.rm, self.metric())
else:
t = GridSearchTuner(exps, self.rm, self.metric())
sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // (
self.exp_num_gpus * self.exp_num_nodes)
num_exps = t.tune(sample_size=sample_size,
n_trials=self.autotuning_config.tuner_num_trials,
early_stopping=self.autotuning_config.tuner_early_stopping)
exp = t.best_exp
metric_val = t.best_metric_val
if exp:
self.update_records(tuning_space_name, exp, metric_val, num_exps)
full_best_record = self.get_best_space_record(tuning_space_name)
full_best_metric_val = full_best_record[1] if full_best_record else -1
if full_best_metric_val > fast_best_metric_val:
best_metric_val = full_best_metric_val
best_mbs = full_best_record[0][DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1
else:
best_metric_val = fast_best_metric_val
best_mbs = fast_best_mbs
logger.info(f"End tuning for space: {tuning_space_name}")
return max_micro_batch_size, best_mbs, best_metric_val
def get_plauteu_mbs(self, tuning_space_name):
if tuning_space_name not in self.records:
return 0
space_records = self.records[tuning_space_name]
sorted_space_records = sorted(
space_records,
key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU])
prev_metric_val = None
prev_micro_batch_size = 0
for (exp, metric_val, _) in sorted_space_records:
if prev_metric_val:
if metric_val < prev_metric_val:
break
if (metric_val >= prev_metric_val
and (metric_val - prev_metric_val) / prev_metric_val <
METRIC_PERCENT_DIFF_CONST):
break
prev_metric_val = metric_val
prev_micro_batch_size = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
plateau_mbs = prev_micro_batch_size
return plateau_mbs
def get_model_num_params(self):
if self.model_info and "num_params" in self.model_info:
return self.model_info["num_params"]
def model_info_profile_run(self):
"""Does a model information profling experiment that collects the number of model parameters and activation memory.\
The experiment produces a "profile_model_info" folder under self.results_dir.
Returns:
[dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
"""
logger.info("Starting model info profile run.")
model_info = self.autotuning_config.model_info
if model_info and MODEL_INFO_NUM_PARAMS in model_info:
return model_info
ds_config = copy.deepcopy(self.user_config)
replace_dict(ds_config, DEFAULT_MIN_MEM_CONFIG)
model_info_path = os.path.join(self.results_dir,
"profile_model_info",
"model_info.json")
ds_config[AUTOTUNING] = {
"enabled": True,
"model_info_path": model_info_path,
"model_info": {
"profile": True
}
}
exp_config = {}
exp_name = "profile_model_info"
exp_config['name'] = exp_name
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
with open(exp_path, 'w', buffering=BUFSIZE) as fd:
json.dump(exp_config, fd)
fd.flush()
os.fsync(fd)
self.rm.schedule_experiments([exp_path])
self.rm.run()
for exp_id, (exp_json, err) in self.rm.finished_experiments.items():
self.rm.clear()
if err:
logger.error(
f"The model is not runnable with DeepSpeed with error = {err}")
return None
if os.path.exists(model_info_path):
with open(model_info_path, 'r') as f:
model_info = hjson.load(f)
return model_info
def update_records(self, space_name, exp, metric_val, num_exps):
if space_name not in self.records:
self.records[space_name] = [(exp, metric_val, num_exps)]
else:
self.records[space_name].append((exp, metric_val, num_exps))
def get_best_space_record(self, space_name):
if space_name not in self.records:
return None
space_records = self.records[space_name]
best_space_record = None
space_num_exps = 0
for (exp, metric_val, num_exps) in space_records:
space_num_exps += num_exps
if best_space_record is None or metric_val > best_space_record[1]:
best_space_record = (exp, metric_val)
if best_space_record:
best_space_record = best_space_record + (space_num_exps, )
return best_space_record
def get_best_space_records(self):
best_space_records = {}
global_best_record = None
for space_name, space_records in self.records.items():
best_space_record = self.get_best_space_record(space_name)
if best_space_record:
best_space_records[space_name] = best_space_record
if not global_best_record or best_space_record[1] > global_best_record[1]:
global_best_record = best_space_record
if global_best_record:
best_space_records[GLOBAL_TUNING_SPACE] = global_best_record
return best_space_records
def run_tuning_micro_batch_sizes(self,
tuning_micro_batch_sizes,
max_train_batch_size_per_gpu,
min_micro_batch_size,
stage,
tuning_micro_batch_sizes_overwritten):
assert tuning_micro_batch_sizes, "the tuning micro batch size list is empty"
tuning_micro_batch_sizes.sort()
max_micro_batch_size = tuning_micro_batch_sizes[-1]
max_micro_batch_size_metric_val = 0
ds_config = get_first_config(self.user_config)
ds_config[ZERO_OPTIMIZATION] = {ZERO_OPTIMIZATION_STAGE: stage}
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
exp_paths = []
for mbs in tuning_micro_batch_sizes:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
gas = max_train_batch_size_per_gpu // mbs
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp_config = {}
exp_config['name'] = exp_name
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
with open(exp_path, 'w', buffering=BUFSIZE) as fd:
json.dump(exp_config, fd)
fd.flush()
os.fsync(fd)
exp_paths.append(exp_path)
self.rm.schedule_experiments(exp_paths)
self.rm.run()
for exp_id, (exp, err) in self.rm.finished_experiments.items():
if exp:
metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH]
if os.path.exists(metric_file):
with open(metric_file, 'r') as f:
results = hjson.load(f)
metric_val = results[self.metric()]
self.update_records(tuning_space_name, exp, metric_val, 1)
if max_micro_batch_size == exp[DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
max_micro_batch_size_metric_val = metric_val
else:
self.update_records(tuning_space_name, exp, 0, 1)
else:
mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
logger.info(f"micro batch size = {mbs} was not run successfully")
self.rm.clear()
if tuning_micro_batch_sizes_overwritten:
return tuning_micro_batch_sizes
# in an auto-detected tuning_micro_batch_sizes list, max_micro_batch_size might not be performant because its memory consumption is close to the maximum
# try smaller values while gas stays the same
# if a more performant mbs value is found, use it to replace max_micro_batch_size in the list
min_micro_batch_size_with_same_gas = (
tuning_micro_batch_sizes[-2] +
1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size
prev_best_metric_val = max_micro_batch_size_metric_val
prev_best_mbs = max_micro_batch_size
stride = (max_micro_batch_size - min_micro_batch_size_with_same_gas) // 3
if stride == 0:
stride = 1
for mbs in reversed(
range(min_micro_batch_size_with_same_gas,
max_micro_batch_size,
stride)):
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
gas = max_train_batch_size_per_gpu // mbs
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
self.update_records(tuning_space_name, exp, metric_val, 1)
if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST):
prev_best_metric_val = metric_val
prev_best_mbs = mbs
else:
break
else:
self.update_records(tuning_space_name, exp, 0, 1)
break
if prev_best_mbs != max_micro_batch_size:
tuning_micro_batch_sizes[-1] = prev_best_mbs
return tuning_micro_batch_sizes
def get_min_max_micro_batch_size(self,
stage,
min_micro_batch_size,
calculated_max_micro_batch_size):
# get min and max micro batch size with gradient accumulation steps = 1
if min_micro_batch_size > calculated_max_micro_batch_size:
return -1, -1
used_micro_batch_sizes = []
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
ds_config = get_first_config(self.user_config)
ds_config[ZERO_OPTIMIZATION] = {ZERO_OPTIMIZATION_STAGE: stage}
gas = self.get_gas_from_user_config()
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
# search for the min micro batch size
if min_micro_batch_size < 1:
if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance(
self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU],
int):
# user specifies train_micro_batch_size_per_gpu as an int
mbs = int(self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU])
else:
# user does not specify train_micro_batch_size_per_gpu or sets it to "auto" when using Hugging Face
val = self.get_val_from_user_args(TRAIN_MICRO_BATCH_SIZE_PER_GPU)
if val:
mbs = int(val)
else:
mbs = 1
assert mbs > 0, "The micro batch size per GPU must be greater than 0."
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mbs)
min_micro_batch_size = mbs
else:
self.update_records(tuning_space_name, exp, 0, 1)
logger.info(
f"User-specified micro batch size per GPU {mbs} does not run")
if self.min_train_micro_batch_size_per_gpu() == mbs:
return -1, -1
mbs = self.min_train_micro_batch_size_per_gpu()
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if not metric_val:
self.update_records(tuning_space_name, exp, 0, 1)
logger.info(
f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.")
return -1, -1
self.update_records(tuning_space_name, exp, metric_val, 1)
min_micro_batch_size = mbs
used_micro_batch_sizes.append(mbs)
else:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = min_micro_batch_size
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = min_micro_batch_size * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(
min_micro_batch_size)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(min_micro_batch_size)
else:
self.update_records(tuning_space_name, exp, 0, 1)
return -1, -1
# search for the max micro batch size
max_micro_batch_size = min(calculated_max_micro_batch_size,
self.max_train_micro_batch_size_per_gpu())
for mbs in [
math.ceil(1.05 * max_micro_batch_size),
max_micro_batch_size,
int(0.95 * max_micro_batch_size)
]:
if mbs > self.max_train_micro_batch_size_per_gpu():
continue
if mbs in used_micro_batch_sizes:
return min_micro_batch_size, mbs
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
ds_config[TRAIN_BATCH_SIZE] = mbs * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
logger.info(f"mbs = {mbs} is found as max mbs")
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mbs)
return min_micro_batch_size, mbs
else:
self.update_records(tuning_space_name, exp, 0, 1)
space_records = self.records[
tuning_space_name] if tuning_space_name in self.records else []
if space_records:
prev_idx = min(range(len(space_records)),
key=lambda i: abs(space_records[i][0][DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU] - min_micro_batch_size))
prev_metric_val = space_records[prev_idx][1]
else:
prev_metric_val = None
low = min_micro_batch_size
high = max_micro_batch_size
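# Binary-search between low and high for the largest runnable micro batch size; on success
# low moves up, on failure high moves down, and the search stops early when the metric
# improvement over the previous value falls below METRIC_PERCENT_DIFF_CONST (a plateau).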
while low < high:
mid = int((low + high) // 2)
logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}")
if mid == low:
break
if mid not in used_micro_batch_sizes:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid
ds_config[TRAIN_BATCH_SIZE] = mid * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
low = mid
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mid)
if prev_metric_val and ((metric_val - prev_metric_val) /
prev_metric_val) < METRIC_PERCENT_DIFF_CONST:
logger.info(f"performance plateaus at mbs = {low}")
break
prev_metric_val = metric_val
else:
self.update_records(tuning_space_name, exp, 0, 1)
high = mid - 1
else:
low = mid
max_micro_batch_size = low
logger.info(
f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}."
)
return min_micro_batch_size, max_micro_batch_size
def get_gas_from_user_config(self):
gas = 1
if GRADIENT_ACCUMULATION_STEPS in self.user_config:
gas_in_config = self.user_config[GRADIENT_ACCUMULATION_STEPS]
if isinstance(gas_in_config, int):
gas = gas_in_config
elif gas_in_config == "auto": # GRADIENT_ACCUMULATION_STEPS: "auto"
val = self.get_val_from_config(GRADIENT_ACCUMULATION_STEPS)
if val:
gas = int(val)
elif isinstance(gas_in_config, list):
logger.info(
f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used."
)
assert gas > 0, "Gradient accumulation steps must be positive."
return gas
def get_val_from_user_args(self, ds_name):
arg_mappings = self.autotuning_config.arg_mappings
user_args = self.args.user_args
if arg_mappings and ds_name in arg_mappings:
arg_name = arg_mappings[ds_name]
if arg_name in user_args:
idx = user_args.index(arg_name)
if user_args[idx + 1].isnumeric():
return (user_args[idx + 1])
return None
def get_tuning_micro_batch_size_list(self,
min_micro_batch_size,
max_micro_batch_size,
num_tuning_micro_batch_sizes):
"""Get a list of micro batch sizes to tune based on min and max values, as well as the size of the list.
Args:
min_micro_batch_size ([int]): min micro batch size per GPU
max_micro_batch_size ([int]): max micro batch size per GPU
num_tuning_micro_batch_sizes (int): the number of items in the returned list
Returns:
[list]: a list of micro batch sizes to tune.
"""
if min_micro_batch_size <= 0 or max_micro_batch_size <= 0:
logger.info(
f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}"
)
return [], 0
# NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} ))
# DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) ))
# GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) ))
if self.max_train_batch_size() and self.max_train_batch_size(
) > 0: # if the user specifies a max_train_batch_size
max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size(
) // (self.exp_num_gpus * self.exp_num_nodes)
else:
gas = self.get_gas_from_user_config()
max_train_batch_size_per_gpu = max_micro_batch_size * gas // self.mp_size()
logger.info(f"max_train_batch_size_per_gpu = {max_train_batch_size_per_gpu}")
if min_micro_batch_size < max_micro_batch_size // 2:
min_micro_batch_size = max_micro_batch_size // 2
# constant stride
stride = (max_micro_batch_size -
min_micro_batch_size) // num_tuning_micro_batch_sizes
if stride == 0:
stride = 1
ls = []
min_gas = max_train_batch_size_per_gpu // max_micro_batch_size
# if gas is the same as min_gas, do not add mbs to the tuning list
for mbs in range(min_micro_batch_size, max_micro_batch_size, stride):
if max_train_batch_size_per_gpu // mbs != min_gas:
ls.append(mbs)
ls.append(max_micro_batch_size)
return ls, max_train_batch_size_per_gpu
def run_ds_config(self, ds_config, exp_name):
exp_config = {}
exp_config['name'] = exp_name
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
logger.debug(f'run_ds_config exp_name = {exp_name}')
with open(exp_path, 'w', buffering=BUFSIZE) as fd:
json.dump(exp_config, fd)
fd.flush()
os.fsync(fd)
self.rm.schedule_experiments([exp_path])
self.rm.run()
exp, metric_val = self.rm.parse_results(self.metric())
self.rm.clear()
return exp, metric_val
def run_after_tuning(self):
""" Launches the training with the optmimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
best_space_records = self.get_best_space_records()
if GLOBAL_TUNING_SPACE not in best_space_records:
return
best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE]
if best_exp:
logger.info(
"Start training with the optmimal DeepSpeed configuration found through the tuning process"
)
exp_dir = best_exp["result_dir"]
cmd = None
with open(os.path.join(exp_dir, "cmd.txt"), "r") as f:
cmd = [str(i) for i in f.read().split()]
ds_config = hjson.load(open(os.path.join(exp_dir, "ds_config.json"), "r"))
ds_config.pop(AUTOTUNING)
ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json")
json.dump(ds_config, open(ds_config_path, "w"))
idx = cmd.index(os.path.join(exp_dir, "ds_config.json"))
cmd[idx] = ds_config_path
cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt")
with open(cmd_path, "w") as fd:
fd.write(" ".join(cmd))
fd.write("\n")
fd.flush()
result = subprocess.Popen(cmd)
result.wait()
logger.info(
f"Done running with the optimal DeepSpeed configuration found by autotuning: {ds_config_path}"
)
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param, get_dict_param, DeepSpeedConfigObject
from deepspeed.autotuning.constants import *
class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedAutotuningConfig, self).__init__()
self.enabled = None
self.start_step = None
self.end_step = None
self.metric_path = None
self.arg_mappings = None
self.metric = None
self.model_info = None
self.results_dir = None
self.exps_dir = None
self.overwrite = None
if param_dict and AUTOTUNING in param_dict.keys():
autotuning_dict = param_dict[AUTOTUNING]
else:
autotuning_dict = {}
self._initialize(autotuning_dict)
def _initialize(self, autotuning_dict):
self.enabled = get_scalar_param(autotuning_dict,
AUTOTUNING_ENABLED,
AUTOTUNING_ENABLED_DEFAULT)
self.fast = get_scalar_param(autotuning_dict,
AUTOTUNING_FAST,
AUTOTUNING_FAST_DEFAULT)
self.results_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_RESULTS_DIR,
AUTOTUNING_RESULTS_DIR_DEFAULT)
self.exps_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_EXPS_DIR,
AUTOTUNING_EXPS_DIR_DEFAULT)
self.overwrite = get_scalar_param(autotuning_dict,
AUTOTUNING_OVERWRITE,
AUTOTUNING_OVERWRITE_DEFAULT)
self.start_profile_step = get_scalar_param(
autotuning_dict,
AUTOTUNING_START_PROFILE_STEP,
AUTOTUNING_START_PROFILE_STEP_DEFAULT)
self.end_profile_step = get_scalar_param(autotuning_dict,
AUTOTUNING_END_PROFILE_STEP,
AUTOTUNING_END_PROFILE_STEP_DEFAULT)
self.metric = get_scalar_param(autotuning_dict,
AUTOTUNING_METRIC,
AUTOTUNING_METRIC_DEFAULT)
self.metric_path = get_scalar_param(autotuning_dict,
AUTOTUNING_METRIC_PATH,
AUTOTUNING_METRIC_PATH_DEFAULT)
self.tuner_type = get_scalar_param(autotuning_dict,
AUTOTUNING_TUNER_TYPE,
AUTOTUNING_TUNER_TYPE_DEFAULT)
self.tuner_early_stopping = get_scalar_param(
autotuning_dict,
AUTOTUNING_TUNER_EARLY_STOPPING,
AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT)
self.tuner_num_trials = get_scalar_param(autotuning_dict,
AUTOTUNING_TUNER_NUM_TRIALS,
AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT)
self.arg_mappings = get_dict_param(autotuning_dict,
AUTOTUNING_ARG_MAPPINGS,
AUTOTUNING_ARG_MAPPINGS_DEFAULT)
self.model_info = get_model_info_config(autotuning_dict)
self.model_info_path = get_scalar_param(autotuning_dict,
AUTOTUNING_MODEL_INFO_PATH,
AUTOTUNING_MODEL_INFO_PATH_DEFAULT)
self.mp_size = get_scalar_param(autotuning_dict,
AUTOTUNING_MP_SIZE,
AUTOTUNING_MP_SIZE_DEFAULT)
self.max_train_batch_size = get_dict_param(
autotuning_dict,
AUTOTUNING_MAX_TRAIN_BATCH_SIZE,
AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT)
self.min_train_batch_size = get_dict_param(
autotuning_dict,
AUTOTUNING_MIN_TRAIN_BATCH_SIZE,
AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT)
self.max_train_micro_batch_size_per_gpu = get_dict_param(
autotuning_dict,
AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)
self.min_train_micro_batch_size_per_gpu = get_dict_param(
autotuning_dict,
AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)
self.num_tuning_micro_batch_sizes = get_dict_param(
autotuning_dict,
AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES,
AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT)
def get_model_info_config(param_dict):
if MODEL_INFO in param_dict and param_dict[MODEL_INFO] is not None:
model_info_config = {}
for key, default_value in MODEL_INFO_KEY_DEFAULT_DICT.items():
model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO],
key,
default_value)
return model_info_config
return None
def get_default_model_info_config():
return MODEL_INFO_KEY_DEFAULT_DICT
{
"zero_optimization": {
"stage": 1,
"reduce_bucket_size": 5e8,
"allgather_bucket_size": 5e8
}
}
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": false
}
}
{
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": false,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 5e8,
"stage3_param_persistence_threshold": 1e6,
"stage3_gather_16bit_weights_on_model_save": false,
"sub_group_size": 1e12
}
}
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
#########################################
# autotuner implementation constants
#########################################
import os
DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"config_templates",
"template_zero0.json")
DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"config_templates",
"template_zero1.json")
DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"config_templates",
"template_zero2.json")
DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"config_templates",
"template_zero3.json")
DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps")
DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results")
METRIC_PERCENT_DIFF_CONST = 0.05
DS_CONFIG = "ds_config"
BUFSIZE = 1 # line buffer size for writing files
#########################################
# autotuner configuration constants
#########################################
# Autotuner. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
AUTOTUNING_FORMAT = """
autotuner should be enabled as:
"session_params": {
"autotuning": {
"enabled": true,
"start_step": 5,
"end_step": 15
}
}
"""
AUTOTUNING = "autotuning"
AUTOTUNING_ENABLED = "enabled"
AUTOTUNING_ENABLED_DEFAULT = False
AUTOTUNING_FAST = "fast"
AUTOTUNING_FAST_DEFAULT = True
AUTOTUNING_RESULTS_DIR = "results_dir"
AUTOTUNING_RESULTS_DIR_DEFAULT = None
AUTOTUNING_EXPS_DIR = "exps_dir"
AUTOTUNING_EXPS_DIR_DEFAULT = None
AUTOTUNING_OVERWRITE = "overwrite"
AUTOTUNING_OVERWRITE_DEFAULT = True
AUTOTUNING_START_PROFILE_STEP = "start_profile_step"
AUTOTUNING_START_PROFILE_STEP_DEFAULT = 3
AUTOTUNING_END_PROFILE_STEP = "end_profile_step"
AUTOTUNING_END_PROFILE_STEP_DEFAULT = 5
AUTOTUNING_METRIC_PATH = "metric_path"
AUTOTUNING_METRIC_PATH_DEFAULT = None
AUTOTUNING_TUNER_TYPE = "tuner_type"
AUTOTUNING_TUNER_GRIDSEARCH = "gridsearch"
AUTOTUNING_TUNER_RANDOM = "random"
AUTOTUNING_TUNER_MODELBASED = "model_based"
AUTOTUNING_TUNER_TYPE_DEFAULT = AUTOTUNING_TUNER_GRIDSEARCH
AUTOTUNING_TUNER_EARLY_STOPPING = "tuner_early_stopping"
AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT = 5
AUTOTUNING_TUNER_NUM_TRIALS = "tuner_num_trials"
AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT = 50
AUTOTUNING_ARG_MAPPINGS = "arg_mappings"
AUTOTUNING_ARG_MAPPINGS_DEFAULT = None
AUTOTUNING_MAX_TRAIN_BATCH_SIZE = "max_train_batch_size"
AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT = None
AUTOTUNING_MIN_TRAIN_BATCH_SIZE = "min_train_batch_size"
AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT = 1
AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU = "max_train_micro_batch_size_per_gpu"
AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = 1024
AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU = "min_train_micro_batch_size_per_gpu"
AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = 1
AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES = "num_tuning_micro_batch_sizes"
AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT = 3
AUTOTUNING_MP_SIZE = "mp_size"
AUTOTUNING_MP_SIZE_DEFAULT = 1
AUTOTUNING_METRIC = "metric"
AUTOTUNING_METRIC_LATENCY = "latency"
AUTOTUNING_METRIC_THROUGHPUT = "throughput"
AUTOTUNING_METRIC_FLOPS = "flops"
AUTOTUNING_METRIC_FORWARD = "forward"
AUTOTUNING_METRIC_BACKWARD = "backward"
AUTOTUNING_METRIC_STEPS = "step"
AUTOTUNING_METRIC_DEFAULT = AUTOTUNING_METRIC_THROUGHPUT
#########################################
# MODEL INFO
#########################################
AUTOTUNING_MODEL_INFO_PATH = "model_info_path"
AUTOTUNING_MODEL_INFO_PATH_DEFAULT = None
MODEL_INFO_FORMAT = '''
"model_info": {
"num_params": 1000000000,
"hidden_size": 10,
"num_layers": 12,
}
'''
MODEL_INFO = "model_info"
MODEL_INFO_PROFILE = "profile"
MODEL_INFO_PROFILE_DEFAULT = False
MODEL_INFO_NUM_PARAMS = "num_params"
MODEL_INFO_NUM_PARAMS_DEFAULT = None
MODEL_INFO_HIDDEN_SIZE = "hidden_size"
MODEL_INFO_HIDDEN_SIZE_DEFAULT = None
MODEL_INFO_NUM_LAYERS = "num_layers"
MODEL_INFO_NUM_LAYERS_DEFAULT = None
MODEL_INFO_KEY_DEFAULT_DICT = {
MODEL_INFO_PROFILE: MODEL_INFO_PROFILE_DEFAULT,
MODEL_INFO_NUM_PARAMS: MODEL_INFO_NUM_PARAMS_DEFAULT,
MODEL_INFO_HIDDEN_SIZE: MODEL_INFO_HIDDEN_SIZE_DEFAULT,
MODEL_INFO_NUM_LAYERS: MODEL_INFO_NUM_LAYERS_DEFAULT
}
#########################################
# autotuner search space constants
#########################################
DEFAULT_HF_CONFIG = {
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
}
DEFAULT_MIN_MEM_CONFIG = {
"train_micro_batch_size_per_gpu": 1,
"zero_optimization": {
"stage": 3
},
"memory_break_down": False
}
DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}
DEFAULT_TUNING_SPACE_ZERO_1 = {
"zero_optimization": {
"stage": 1,
"reduce_bucket_size": [5e7,
5e8,
1e9],
"allgather_bucket_size": [5e7,
5e8,
1e9],
}
}
DEFAULT_TUNING_SPACE_ZERO_2 = {
"zero_optimization": {
"stage": 2,
"overlap_comm": [True,
False],
"reduce_scatter": [False,
True],
"reduce_bucket_size": [5e7,
5e8,
1e9],
"allgather_bucket_size": [5e7,
5e8,
1e9],
"contiguous_gradients": [False,
True]
},
}
DEFAULT_TUNING_SPACE_ZERO_3 = {
"zero_optimization": {
"stage": 3,
"overlap_comm": [True,
False],
"reduce_scatter": [False,
True],
"reduce_bucket_size": [5e7,
5e8,
1e9],
"allgather_partitions": [True,
False],
"allgather_bucket_size": [5e7,
5e8,
1e9],
"contiguous_gradients": [False,
True]
},
}
GLOBAL_TUNING_SPACE = 'global'
# TUNING_MICRO_BATCH_SIZE_PREFIX="tune_micro_batch_size_z"
TUNING_MICRO_BATCH_SIZE_PREFIX = "z"