Unverified Commit 5c3843dc authored by shenggan, committed by GitHub

add colossalai kernel module (#55)

parent 648f8063
from .jit.bias_dropout_add import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
from .jit.bias_gelu import bias_gelu_impl
from .cuda_native import LayerNorm, FusedScaleMaskSoftmax, MultiHeadAttention
__all__ = [
"bias_dropout_add_fused_train", "bias_dropout_add_fused_inference", "bias_gelu_impl",
"LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"
]
from .builder import _build_cuda_native_kernel
CUDA_NATIVE_KERNEL_BUILD = False
def build_cuda_native_kernel():
global CUDA_NATIVE_KERNEL_BUILD
if not CUDA_NATIVE_KERNEL_BUILD:
_build_cuda_native_kernel()
CUDA_NATIVE_KERNEL_BUILD = True
build_cuda_native_kernel()
from .layer_norm import MixedFusedLayerNorm as LayerNorm
from .scaled_softmax import FusedScaleMaskSoftmax
from .multihead_attention import MultiHeadAttention
import os
import pathlib
import subprocess
from torch.utils import cpp_extension
# Setting this param to a list has a problem of generating different
# compilation commands (with a different order of architectures) and
# leading to recompilation of fused kernels. Set it to an empty string
# to avoid recompilation and assign arch flags explicitly in
# extra_cuda_cflags below
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
def _build_cuda_native_kernel():
# Check if CUDA 11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append('-gencode')
cc_flag.append('arch=compute_80,code=sm_80')
# Build path
basepath = pathlib.Path(__file__).parent.absolute()
srcpath = basepath / 'csrc'
buildpath = basepath / 'build'
_create_build_dir(buildpath)
# Helper function to build the kernels.
def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
return cpp_extension.load(
name=name,
sources=sources,
build_directory=buildpath,
extra_cflags=[
'-O3',
],
extra_include_paths=[str(srcpath / 'kernels' / 'include')],
extra_cuda_cflags=['-O3', '-gencode', 'arch=compute_70,code=sm_70', '--use_fast_math'] +
extra_cuda_flags + cc_flag,
verbose=False)
# ==============
# Fused softmax.
# ==============
extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda']
# Upper triangular softmax.
sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
colossal_scaled_upper_triang_masked_softmax = _cpp_extention_load_helper(
"colossal_scaled_upper_triang_masked_softmax",
sources, extra_cuda_flags)
# Masked softmax.
sources=[srcpath / 'scaled_masked_softmax.cpp',
srcpath / 'scaled_masked_softmax_cuda.cu']
colossal_scaled_masked_softmax = _cpp_extention_load_helper(
"colossal_scaled_masked_softmax", sources, extra_cuda_flags)
# =================================
# Mixed precision fused layer norm.
# =================================
extra_cuda_flags = ['-maxrregcount=50']
sources = [srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu']
colossal_layer_norm_cuda = _cpp_extention_load_helper("colossal_layer_norm_cuda", sources,
extra_cuda_flags)
# ==========================================
# Mixed precision Transformer Encoder Layer.
# ==========================================
extra_cuda_flags = ['-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-DTHRUST_IGNORE_CUB_VERSION_CHECK']
sources = [srcpath / 'multihead_attention_1d.cpp']
kernel_sources = ["cublas_wrappers.cu",
"transform_kernels.cu",
"dropout_kernels.cu",
"normalize_kernels.cu",
"softmax_kernels.cu",
"general_kernels.cu",
"cuda_util.cu"]
sources += [(srcpath / 'kernels' / cu_file) for cu_file in kernel_sources]
colossal_multihead_attention = _cpp_extention_load_helper("colossal_multihead_attention", sources,
extra_cuda_flags)
def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def _create_build_dir(buildpath):
try:
os.mkdir(buildpath)
except OSError:
if not os.path.isdir(buildpath):
print(f"Creation of the build directory {buildpath} failed")
/* This code is from NVIDIA apex:
 * https://github.com/NVIDIA/apex
 * with minor changes. */
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"
ls::cub::CachingDeviceAllocator g_allocator(true);
template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
const T *__restrict__ inputs, const int *__restrict__ targets,
float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
const int padding_idx, const float epsilon, const int vocab_size) {
/* step 1: compute each thread's max_logit and sum_exp_logit, store them in
 * max_input and sum_logits */
const int block_start = blockIdx.x * vocab_size;
const int left_idx = block_start + threadIdx.x;
const int right_idx = (blockIdx.x + 1) * vocab_size;
float max_input[1] = {REDUCE_FLOAT_INF_NEG};
float sum_logits[2] = {0.f, 0.f}; // logit and logit exp
int target_tid = targets[blockIdx.x];
if (target_tid == padding_idx) {
if (threadIdx.x == 0) {
nll_loss_outputs[blockIdx.x] = 0.f;
outputs[blockIdx.x] = 0.f;
}
return;
}
for (int i = left_idx; i < right_idx; i += blockDim.x) {
max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
}
blockReduce<ReduceType::kMax, 1>(max_input);
__shared__ float s_max_input;
if (threadIdx.x == 0) {
s_max_input = max_input[0];
}
__syncthreads();
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float logit = static_cast<float>(inputs[i]) - s_max_input;
sum_logits[0] += logit;
sum_logits[1] += expf(logit);
}
blockReduce<ReduceType::kSum, 2>(sum_logits);
__shared__ float s_sum_logit;
__shared__ float s_sum_exp;
if (threadIdx.x == 0) {
s_sum_logit = sum_logits[0];
s_sum_exp = sum_logits[1];
}
__syncthreads();
float eps_i = epsilon / (vocab_size - 1);
if (threadIdx.x == 0) {
// neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
float nll_loss = logf(s_sum_exp) -
static_cast<float>(inputs[block_start + target_tid]) +
s_max_input;
nll_loss_outputs[blockIdx.x] = nll_loss;
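// Label smoothing: the target distribution places (1 - epsilon) on the gold
// token and eps_i = epsilon / (vocab_size - 1) on every other token.
// Expanding the expected negative log-likelihood under that distribution
// gives (1 - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss, which is
// exactly the expression computed below.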
float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
outputs[blockIdx.x] =
(1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
}
}
template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
const int *__restrict__ targets, T *__restrict__ grad_inputs,
const int padding_idx, const float epsilon, const int vocab_size) {
/* step 1: compute each thread's max_logit and sum_exp_logit, store them in
 * max_input and sum_logits */
const int block_start = blockIdx.x * vocab_size;
const int left_idx = block_start + threadIdx.x;
const int right_idx = (blockIdx.x + 1) * vocab_size;
float max_input[1] = {REDUCE_FLOAT_INF_NEG};
float sum_logits[1] = {0.f};
const float grad_out = static_cast<float>(grad_outputs[0]);
int target_tid = targets[blockIdx.x];
if (target_tid == padding_idx) {
for (int i = left_idx; i < right_idx; i += blockDim.x) {
grad_inputs[i] = 0.f;
}
return;
}
for (int i = left_idx; i < right_idx; i += blockDim.x) {
max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
}
blockReduce<ReduceType::kMax, 1>(max_input);
__shared__ float s_max_input;
if (threadIdx.x == 0) {
s_max_input = max_input[0];
}
__syncthreads();
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float logit = static_cast<float>(inputs[i]) - s_max_input;
sum_logits[0] += expf(logit);
}
blockReduce<ReduceType::kSum, 1>(sum_logits);
__shared__ float s_sum_exp;
if (threadIdx.x == 0) {
s_sum_exp = sum_logits[0];
}
__syncthreads();
float eps_i = epsilon / (vocab_size - 1);
float nll_weight = 1.0 - epsilon - eps_i;
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
float grad = 0;
grad += (vocab_size * prob - 1) * eps_i;
grad += prob * nll_weight;
if ((i - block_start) == target_tid) {
grad -= nll_weight;
}
grad_inputs[i] = grad_out * grad;
}
}
template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
float *outputs_ptr, float *nll_loss_ptr,
float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size,
const int seq_len, const int vocab_size,
cudaStream_t stream) {
int grid_dim = batch_size * seq_len;
float *nll_loss_buffer = loss_buffer + grid_dim;
ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
epsilon, vocab_size);
int num_items = grid_dim;
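// CUB device-wide reductions are two-phase: the first Sum call, made with a
// null workspace pointer, only reports the required temp_storage_bytes; the
// workspace is then allocated and the same call is repeated to run the
// actual reduction.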
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
loss_buffer, outputs_ptr,
num_items, stream));
CHECK_GPU_ERROR(
g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
loss_buffer, outputs_ptr,
num_items, stream));
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
nll_loss_buffer, nll_loss_ptr,
num_items, stream));
CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}
template void launch_cross_entropy_fw<float>(
const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template void launch_cross_entropy_fw<__half>(
const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr,
const int padding_idx, const float epsilon,
const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream) {
int grid_dim = batch_size * seq_len;
ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
epsilon, vocab_size);
}
template void launch_cross_entropy_bw<float>(
const float *grad_outputs_ptr, const float *inputs_ptr,
const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template void launch_cross_entropy_bw<__half>(
const float *grad_outputs_ptr, const __half *inputs_ptr,
const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#include "cublas_wrappers.h"
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
const float *B, float *C, cublasGemmAlgo_t algo) {
cublasStatus_t status =
cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
(const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
(const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
(const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const __half *A,
const __half *B, __half *C, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmEx(
handle, transa, transb, m, n, k, (const void *)alpha, (const void *)A,
CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k, (const void *)B, CUDA_R_16F,
(transb == CUBLAS_OP_N) ? k : n, (const void *)beta, (void *)C,
CUDA_R_16F, m, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const float *A, const float *B, float *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmStridedBatchedEx(
handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
(op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
(op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m, stride_C,
batch, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
"error: %d) \n",
batch, m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const __half *A, const __half *B, __half *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmStridedBatchedEx(
handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
(op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
(op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m, stride_C,
batch, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include "cuda_util.h"
/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
return cudaGetErrorString(error);
}
std::string _cudaGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "CUBLAS_UNKNOW";
}
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
int const line) {
if (result) {
throw std::runtime_error(std::string("[CUDA][ERROR] ") + file + "(" +
std::to_string(line) +
"): " + (_cudaGetErrorString(result)) + "\n");
}
}
template void check_gpu_error<cudaError_t>(cudaError_t result,
char const *const func,
const char *const file,
int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,
char const *const func,
const char *const file,
int const line);
template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
std::cout << outn << ": ";
std::vector<T> hout(num_output_ele, (T)0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
cudaMemcpyDeviceToHost);
for (int i = 0; i < num_output_ele; i++) {
std::cout << hout[i] << ", ";
}
std::cout << std::endl;
}
template <>
void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::vector<__half> hout(num_output_ele, (__half)0.f);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
cudaMemcpyDeviceToHost);
for (int i = 0; i < num_output_ele; i++) {
std::cout << __half2float(hout[i]) << ", ";
}
std::cout << std::endl;
}
template void print_vec<float>(const float *outv, std::string outn,
int num_output_ele);
template void print_vec<int>(const int *outv, std::string outn,
int num_output_ele);
template void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele);
template <typename T>
T *cuda_malloc(size_t ele_num) {
size_t byte_size = ele_num * sizeof(T);
T *pdata = nullptr;
CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
return pdata;
}
template float *cuda_malloc<float>(size_t ele_num);
template __half *cuda_malloc<__half>(size_t ele_num);
template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);
void cuda_free(void *pdata) {
if (pdata != nullptr) {
cudaFree(pdata);
}
}
template <typename T>
struct _isnan {
__device__ bool operator()(T a) const { return isnan(a); }
};
template <>
struct _isnan<__half> {
__device__ bool operator()(const __half a) const { return __hisnan(a); }
};
template <typename T>
struct _isinf {
__device__ bool operator()(T a) const { return isinf(a); }
};
template <>
struct _isinf<__half> {
__device__ bool operator()(const __half a) const { return __hisinf(a); }
};
template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
std::string file, int line, cudaStream_t stream) {
// check_nan_inf = true  -> check for nan
// check_nan_inf = false -> check for inf
bool res = false;
std::string msg = file + "(" + std::to_string(line) + "): ";
if (check_nan_inf) {
msg += "nan.";
res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
data_ptr + dsize, _isnan<T>(), false,
thrust::logical_or<bool>());
} else {
msg += "inf.";
res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
data_ptr + dsize, _isinf<T>(), false,
thrust::logical_or<bool>());
}
if (res) {
throw std::runtime_error(msg);
}
std::cout << msg << " [check pass]." << std::endl;
}
template void check_nan_inf<float>(const float *data_ptr, int dsize,
bool check_nan_inf, std::string file,
int line, cudaStream_t stream);
template void check_nan_inf<__half>(const __half *data_ptr, int dsize,
bool check_nan_inf, std::string file,
int line, cudaStream_t stream);
#include "kernels.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.
@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE
@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
T *__restrict__ out, int rows, int cols) {
__shared__ float tile[WARP_SIZE][WARP_SIZE];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
int y_stride = cols * WARP_SIZE;
float localSum = 0;
// Loop across matrix row
// TODO: optimize to log complexity
if (idx < cols) {
int offset = flat_2dim(threadIdx.y, idx, cols);
for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
localSum += (float)inp[offset];
offset += y_stride;
}
}
// The sum of a row in tile is equal to the sum of a col in original matrix
tile[threadIdx.x][threadIdx.y] = localSum;
__syncthreads();
// Read the tile transposed: consecutive threadIdx.x values now access
// consecutive elements of a tile row, which is reduced below.
float sum = tile[threadIdx.y][threadIdx.x];
__syncthreads();
// Calculate the sum of a row in tile
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
if (threadIdx.x == 0) {
int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
if (pos < cols) out[pos] = sum;
}
}
// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
int rows, int cols,
cudaStream_t stream) {
dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
dim3 block_dim(WARP_SIZE, WARP_SIZE);
column_sum_reduce<float>
<<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
int rows, int cols,
cudaStream_t stream) {
dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
dim3 block_dim(WARP_SIZE, WARP_SIZE);
column_sum_reduce<__half>
<<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
/**
@brief: fused_add2
Add two matrix inp1 and inp2 to out.
@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)
@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
int hidden_dim);
template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
const float *inp2, int hidden_dim) {
int row_id = blockIdx.x;
int offset = flat_2dim(row_id, 0, hidden_dim);
const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
float4 *out_4 = reinterpret_cast<float4 *>(out);
float4 vinp1;
float4 vinp2;
float4 val;
for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
vinp1 = inp1_4[offset + i];
vinp2 = inp2_4[offset + i];
val.x = vinp1.x + vinp2.x;
val.y = vinp1.y + vinp2.y;
val.z = vinp1.z + vinp2.z;
val.w = vinp1.w + vinp2.w;
out_4[offset + i] = val;
}
}
template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
const __half *inp2, int hidden_dim) {
int row_id = blockIdx.x;
int offset = flat_2dim(row_id, 0, hidden_dim);
const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
float4 *out_4 = reinterpret_cast<float4 *>(out);
float4 vinp1;
float4 vinp2;
float4 val;
__half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
__half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
__half2 *h2_val = reinterpret_cast<__half2 *>(&val);
for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
vinp1 = inp1_4[offset + i];
vinp2 = inp2_4[offset + i];
h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
out_4[offset + i] = val;
}
}
//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1, const float *inp2,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t &stream) {
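  // The float kernel loads float4 vectors (4 floats per load), so the
  // per-row loop length becomes hidden_dim / 4; the __half specialization
  // below packs 8 halves per float4 load and shifts by 3 instead.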
hidden_dim >>= 2;
dim3 grid_dim(batch_size * seq_len);
dim3 block_dim(min(hidden_dim, MAX_THREADS));
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
hidden_dim);
}
template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
const __half *inp2, int batch_size, int seq_len,
int hidden_dim, cudaStream_t &stream) {
hidden_dim >>= 3;
dim3 grid_dim(batch_size * seq_len);
dim3 block_dim(min(hidden_dim, MAX_THREADS));
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
hidden_dim);
}
template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
int sz0, int sz2, int sz1_1, int sz1_2) {
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
if (idx >= nele) {
return;
}
float4 *dst_ptr = (float4 *)output + idx;
int idx2 = idx % sz2;
idx = idx / sz2;
int idx1 = idx % (sz1_1 + sz1_2);
int idx0 = idx / (sz1_1 + sz1_2);
float4 *src_ptr = nullptr;
int sz1 = 0;
if (idx1 < sz1_1) {
sz1 = sz1_1;
src_ptr = (float4 *)inp1;
} else {
idx1 -= sz1_1;
sz1 = sz1_2;
src_ptr = (float4 *)inp2;
}
src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
dst_ptr[0] = src_ptr[0];
}
template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
float *output, int sz0, int sz2, int sz1_1,
int sz1_2, cudaStream_t stream) {
sz2 >>= 2;
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
__half *output, int sz0, int sz2, int sz1_1,
int sz1_2, cudaStream_t stream) {
sz2 >>= 3;
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
/* Copyright 2021 The LightSeq Team
Copyright Tencent/TurboTransformers
This block_reduce_n is adapted from Tencent/TurboTransformers
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
enum class ReduceType { kMax = 0, kSum };
const unsigned int WARP_REDUCE_MASK = 0xffffffff;
const float REDUCE_FLOAT_INF_NEG = -100000000.f;
const float REDUCE_FLOAT_INF_POS = 100000000.f;
const unsigned int WARP_REDUCE_SIZE = 32;
template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
for (int mask = (WARP_REDUCE_SIZE >> 1); mask > 0; mask >>= 1)
val += __shfl_xor_sync(WARP_REDUCE_MASK, val, mask, WARP_REDUCE_SIZE);
return val;
}
/* Calculate the sum of all elements in a block */
template <typename T>
__forceinline__ __device__ T blockReduceSum(T val) {
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if (lane == 0) shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5)) ? shared[lane] : (T)0.0f;
val = warpReduceSum<T>(val);
return val;
}
template <ReduceType Rtype, int Num>
__inline__ __device__ void blockReduce(float *pval);
// use template to make code more concise
template <ReduceType Rtype, int Num>
__inline__ __device__ void warpReduce(float *pval);
// static
template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 1>(float *pval) {
*pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32));
*pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32));
*pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32));
*pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32));
*pval = max(*pval, __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32));
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kMax, 2>(float *pval) {
float val0_tmp, val1_tmp;
#define WarpReduceMaxOneStep(a, b) \
val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval), a, b); \
val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
*(pval) = max(val0_tmp, *(pval)); \
*(pval + 1) = max(val1_tmp, *(pval + 1));
WarpReduceMaxOneStep(16, 32);
WarpReduceMaxOneStep(8, 32);
WarpReduceMaxOneStep(4, 32);
WarpReduceMaxOneStep(2, 32);
WarpReduceMaxOneStep(1, 32);
#undef WarpReduceMaxOneStep
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 1>(float *pval) {
*pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 16, 32);
*pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 8, 32);
*pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 4, 32);
*pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 2, 32);
*pval += __shfl_xor_sync(WARP_REDUCE_MASK, *pval, 1, 32);
}
/*
 * Unroll the warp-reduce loop to improve
 * instruction issue efficiency.
 * ElemX means there are X numbers to be summed
 */
template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 2>(float *pval) {
float val0_tmp, val1_tmp;
#define WarpReduceSumOneStep(a, b) \
val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
*(pval + 0) += val0_tmp; \
*(pval + 1) += val1_tmp
WarpReduceSumOneStep(16, 32);
WarpReduceSumOneStep(8, 32);
WarpReduceSumOneStep(4, 32);
WarpReduceSumOneStep(2, 32);
WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}
template <>
__inline__ __device__ void warpReduce<ReduceType::kSum, 4>(float *pval) {
float val0_tmp, val1_tmp, val2_tmp, val3_tmp;
#define WarpReduceSumOneStep(a, b) \
val0_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 0), a, b); \
val1_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 1), a, b); \
val2_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 2), a, b); \
val3_tmp = __shfl_xor_sync(WARP_REDUCE_MASK, *(pval + 3), a, b); \
*(pval + 0) += val0_tmp; \
*(pval + 1) += val1_tmp; \
*(pval + 2) += val2_tmp; \
*(pval + 3) += val3_tmp
WarpReduceSumOneStep(16, 32);
WarpReduceSumOneStep(8, 32);
WarpReduceSumOneStep(4, 32);
WarpReduceSumOneStep(2, 32);
WarpReduceSumOneStep(1, 32);
#undef WarpReduceSumOneStep
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 1>(float *pval) {
const int num = 1;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kSum, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = 0.f;
}
}
warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 2>(float *pval) {
const int num = 2;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kSum, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = 0.f;
}
}
warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kSum, 4>(float *pval) {
const int num = 4;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kSum, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = 0.f;
}
}
warpReduce<ReduceType::kSum, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 1>(float *pval) {
const int num = 1;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kMax, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = REDUCE_FLOAT_INF_NEG;
}
}
warpReduce<ReduceType::kMax, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 2>(float *pval) {
const int num = 2;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kMax, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = REDUCE_FLOAT_INF_NEG;
}
}
warpReduce<ReduceType::kMax, num>(pval);
}
template <>
__inline__ __device__ void blockReduce<ReduceType::kMax, 4>(float *pval) {
const int num = 1;
static __shared__ float shared[num][32];
int lane_id = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
warpReduce<ReduceType::kMax, num>(pval);
if (lane_id == 0) {
#pragma unroll
for (int i = 0; i < num; ++i) {
shared[i][wid] = *(pval + i);
}
}
__syncthreads();
if (threadIdx.x < (blockDim.x >> 5)) {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = shared[i][lane_id];
}
} else {
#pragma unroll
for (int i = 0; i < num; ++i) {
*(pval + i) = REDUCE_FLOAT_INF_NEG;
}
}
warpReduce<ReduceType::kMax, num>(pval);
}
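A minimal usage sketch of the block-level reductions above (the kernel, buffer names and launch shape are hypothetical, not part of this commit; block_reduce.h is assumed to be included): each thread accumulates a private partial sum over its block's segment, blockReduce combines the partials, and thread 0 of the block publishes the result.
__global__ void block_sum_sketch(const float *in, float *out, int n) {
  float val[1] = {0.f};
  // strided per-thread accumulation over this block's n-element segment
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    val[0] += in[blockIdx.x * n + i];
  }
  blockReduce<ReduceType::kSum, 1>(val);
  if (threadIdx.x == 0) {
    out[blockIdx.x] = val[0];  // thread 0 holds the block-wide sum
  }
}
// e.g. block_sum_sketch<<<num_segments, 256, 0, stream>>>(in, out, n);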
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <iostream>
#include <string>
#include "cuda_util.h"
class Context {
public:
Context() : _stream(nullptr) {
CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
}
virtual ~Context() {}
static Context &Instance() {
static Context _ctx;
return _ctx;
}
void set_stream(cudaStream_t stream) {
_stream = stream;
CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
}
cudaStream_t get_stream() { return _stream; }
cublasHandle_t get_cublashandle() { return _cublasHandle; }
private:
cudaStream_t _stream;
cublasHandle_t _cublasHandle;
};
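A minimal sketch of how this singleton is typically consumed (the function and stream argument are hypothetical): bind the current stream once, then reuse the cached cuBLAS handle for subsequent GEMM calls.
void bind_stream_sketch(cudaStream_t stream) {
  Context::Instance().set_stream(stream);  // also attaches the stream to cuBLAS
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  (void)handle;  // handed to the GEMM wrappers declared in cublas_wrappers.h
}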
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <type_traits>
#include "cuda_util.h"
template <typename T>
class CrossEntropyLayer {
public:
CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);
virtual ~CrossEntropyLayer();
void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr);
void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr);
void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);
private:
void allocate_mem_buffer() {
// allocate local gpu memory
_loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
}
void free_mem_buffer() {
// free local gpu memory
cuda_free(_loss_buffer);
}
const int _padding_idx;
const float _epsilon;
const int _max_batch_tokens;
size_t _batch_size;
size_t _seq_len;
size_t _vocab_size;
float *_loss_buffer;
};
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
const float *B, float *C,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const __half *A,
const __half *B, __half *C,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const float *A, const float *B, float *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
int cublas_strided_batched_gemm(
cublasHandle_t handle, int m, int n, int k, const float *alpha,
const float *beta, const __half *A, const __half *B, __half *C,
cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
int stride_C, int batch,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
int const line);
#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)
template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);
template <typename T>
T *cuda_malloc(size_t ele_num);
void cuda_free(void *pdata);
template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
std::string file, int line, cudaStream_t stream);
#define CHECK_NAN_INF(ptr, size, stream) \
check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \
check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
#pragma once
#include <string>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "kernels.h"
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
bool training;
Config(float r) : ratio(r), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
};
Dropout(const Config &config, size_t max_ele_num)
: _config(config), _mask(nullptr) {
_mask = cuda_malloc<uint8_t>(max_ele_num);
}
virtual ~Dropout() { cuda_free(_mask); }
// after attention softmax
void dropout(T *output, const T *input, int count, cudaStream_t stream,
bool bwd = false) {
launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
bwd);
}
void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
stream, true);
}
// transformer layer's postprocessing dropout, after attn or ffn module,
// before residual add.
void bias_dropout_residual(T *output, const T *input, const T *residual,
const T *bias, int rows, int cols,
cudaStream_t stream) {
launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
rows * cols, cols, _config.RATIO(), stream);
}
void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
int rows, int cols, cudaStream_t stream) {
launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
_config.RATIO(), stream);
}
// dropout inside ffn.
void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
int cols, std::string activation_fn,
cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}
void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
const T *bias, int rows, int cols,
std::string activation_fn, cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
private:
uint8_t *_mask;
Config _config;
};
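A minimal sketch of the post-sub-layer path (buffer pointers, sizes and the 0.1 ratio are hypothetical): a single fused launch performs bias add, dropout and residual add.
void post_layer_dropout_sketch(float *out, const float *ffn_out,
                               const float *residual, const float *bias,
                               int rows, int cols, cudaStream_t stream) {
  Dropout<float> dropout(Dropout<float>::Config(0.1f), (size_t)rows * cols);
  // bias add + dropout + residual add fused in one kernel launch
  dropout.bias_dropout_residual(out, ffn_out, residual, bias, rows, cols,
                                stream);
}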
#pragma once
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
#include "kernels.h"
template <typename T>
class FeedForward {
public:
struct Config {
int outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int outputs, int inputs)
: outputSize(outputs),
inputSize(inputs),
gemm_algos(std::array<int, 3>({99, 99, 99})) {}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
cublasHandle_t &_cublasHandle) {
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
out, cublasGemmAlgo_t(config_.gemm_algos[0]));
}
void Backward(int bsz, const T *out_grad, const T *input_ptr,
const T *weights, T *weights_grad, T *bias_grad,
cublasHandle_t &_cublasHandle, cudaStream_t &stream,
T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
bool compute_bias = true) {
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
if (compute_bias) {
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
config_.outputSize, stream);
}
}
void reset_size(int outputSize, int inputSize) {
config_.outputSize = outputSize;
config_.inputSize = inputSize;
}
private:
Config config_;
};
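A minimal forward-pass sketch (the 1024 -> 4096 sizes and pointer names are hypothetical; context.h is assumed to be included for the handle). In a row-major view this computes out = input * W^T, with W stored as [outputSize, inputSize].
void ffn_forward_sketch(const float *input, const float *weights, float *out,
                        int bsz) {
  // hypothetical sizes: project 1024 input features to 4096 outputs
  FeedForward<float> ff(FeedForward<float>::Config(4096, 1024));
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  ff.Forward(bsz, input, weights, out, handle);
}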
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#define MAX_THREADS 1024
#define WARP_SIZE 32
enum class ActivationType { kRelu, kGelu };
void launch_curand_init(int total_count, int dim, cudaStream_t stream);
template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
const T *scale, const T *bias, int batch_size,
int hidden_dim, cudaStream_t stream);
template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
const T *residual_grad, const T *inp_or_out, const T *gamma,
const T *betta, const T *vars, const T *means, int batch,
int hidden_dim, cudaStream_t stream[2]);
template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size, int heads,
int from_len, int to_len, bool mask_future,
cudaStream_t stream);
template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
int softmax_len, cudaStream_t stream);
// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
int seq_length, int hidden_dim, int nhead,
cudaStream_t stream);
// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
int dim_0, int dim_1, int dim_2, int dim_3,
int dim_4, cudaStream_t stream);
// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
int seq_len, int hidden_dim, int nhead,
int trans_count, cudaStream_t stream);
template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
float ratio, cudaStream_t stream, bool backward = false);
template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
const T *bias, const T *residual,
int total_count, int dim, float ratio,
cudaStream_t stream);
template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
const T *bias, int total_count, int dim,
float ratio, cudaStream_t stream);
template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
const T *bias, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows, int cols,
cudaStream_t stream);
void launch_param_update(const float *input, __half *output, int size,
cudaStream_t stream);
template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
int sz2, int sz1_1, int sz1_2, cudaStream_t stream);
template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
int seq_len, int hidden_size, cudaStream_t &stream);
template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
float *outputs_ptr, float *nll_loss_ptr,
float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size,
const int seq_len, const int vocab_size,
cudaStream_t stream);
template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr,
const int padding_idx, const float epsilon,
const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template <typename T>
void launch_lookup_scale_pos_dropout(
T *output, const int *input, const T *embeddings, const T *pos_embeddings,
uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);
template <typename T>
void launch_d_lookup_scale_pos_dropout(
T *grad_embeddings, const T *grad_output, const int *input,
const uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
int vocab_size, int padding_idx, float dropout_ratio, cudaStream_t &stream);
/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
return id1 * dim2 + id2;
}
/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
int dim2, int dim3) {
return id1 * dim2 * dim3 + id2 * dim3 + id3;
}
/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
int id4, int dim2, int dim3,
int dim4) {
// return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
int res = id4;
int ld = dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
int id4, int id5, int dim2,
int dim3, int dim4,
int dim5) {
// return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5) +
// id4*dim5 + id5;
int res = id5;
int ld = dim5;
res += id4 * ld;
ld *= dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
int id4, int id5, int id6,
int dim2, int dim3, int dim4,
int dim5, int dim6) {
// return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
// id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
int res = id6;
int ld = dim6;
res += id5 * ld;
ld *= dim5;
res += id4 * ld;
ld *= dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
int *id1, int *id2, int *id3, int *id4, int *id5) {
*id5 = src % dim5;
src /= dim5;
*id4 = src % dim4;
src /= dim4;
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
int dim2, int dim3,
int dim4, int *id0,
int *id1, int *id2,
int *id3, int *id4) {
*id4 = src % dim4;
src /= dim4;
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
int dim2, int dim3,
int *id0, int *id1,
int *id2, int *id3) {
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
int dim2, int *id0,
int *id1, int *id2) {
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
int *id0, int *id1) {
*id1 = src % dim1;
*id0 = src / dim1;
}
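A small worked example of the flatten/decompose helpers (numbers chosen purely for illustration):
// For a [batch=2, seq=8, hidden=512] tensor, element (b=1, s=3, h=100) maps to
//   flat_3dim(1, 3, 100, 8, 512) == 1*8*512 + 3*512 + 100 == 5732
// and decompose_3dim(5732, 8, 512, &b, &s, &h) recovers b==1, s==3, h==100.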
// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t hidden_dim;
bool use_mean;
Config(uint32_t hidden_dim, bool use_mean = false)
: hidden_dim(hidden_dim), use_mean(use_mean) {}
};
Normalize_Layer(Config config, size_t max_rows)
: config_(config), vars_(nullptr), means_(nullptr) {
vars_ = cuda_malloc<T>(max_rows);
if (config_.use_mean) {
means_ = cuda_malloc<T>(max_rows);
}
}
~Normalize_Layer() {
cuda_free(vars_);
cuda_free(means_);
}
void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
int batch_size, cudaStream_t stream) {
launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
config_.hidden_dim, stream);
}
/*
residual_grad, inp_or_out and betta need care:
  inp_or_out is the layer input when use_mean is true, otherwise the output.
  residual_grad and betta may be nullptr.
  residual_grad, if not nullptr, is added to inp_grad, which is useful for
  pre-LN transformer layers.
  betta is only used to compute xhat, so
  (use_mean == false) ^ (betta == nullptr) should hold.
*/
void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
const T *residual_grad, const T *inp_or_out, const T *gamma,
const T *betta, int batch_size, cudaStream_t stream[2]) {
launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
inp_or_out, gamma, betta, vars_, means_, batch_size,
config_.hidden_dim, stream);
}
inline bool use_mean() const { return config_.use_mean; }
private:
Config config_;
T *vars_;
T *means_;
};
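A minimal usage sketch (row count, hidden size and pointer names are hypothetical): with use_mean enabled the layer also stores per-row means, so the backward pass can recompute xhat from the input.
void layer_norm_sketch(float *ln_out, const float *inp, const float *gamma,
                       const float *betta, int rows, int hidden_dim,
                       cudaStream_t stream) {
  Normalize_Layer<float> ln(
      Normalize_Layer<float>::Config(hidden_dim, /*use_mean=*/true), rows);
  ln.Forward(ln_out, inp, gamma, betta, rows, stream);
}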
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t nhead;
Config(size_t nhead) : nhead(nhead) {}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
int to_len, cudaStream_t &stream, bool mask_future = true) {
launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead, from_len,
to_len, mask_future, stream);
}
void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
int to_len, cudaStream_t stream) {
launch_attn_softmax_bw<T>(out_grad, soft_out,
batch_size * config_.nhead * from_len, to_len,
stream);
}
void reset_size(size_t nhead) {
config_.nhead = nhead;
}
private:
Config config_;
};
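A minimal attention-softmax sketch (the head count, shapes and names are hypothetical; vals is assumed to hold [batch, nhead, from_len, to_len] scores and is softmaxed in place over the last dimension).
void attn_softmax_sketch(float *vals, const float *attn_mask, int batch_size,
                         int from_len, int to_len, cudaStream_t stream) {
  Softmax<float> softmax(Softmax<float>::Config(/*nhead=*/16));
  softmax.Forward(vals, attn_mask, batch_size, from_len, to_len, stream,
                  /*mask_future=*/true);
}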