OpenDAS / ColossalAI · Commits

Commit 8823cc48 (unverified)
Authored Jan 29, 2024 by Frank Lee; committed by GitHub, Jan 29, 2024

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu

Parents: bce9499e, 73f4dc57
Changes: 266
Showing 20 changed files with 0 additions and 2764 deletions (+0 −2764):
colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh                       +0 −43
colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cu                        +0 −225
colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cuh                       +0 −53
colossalai/kernel/cuda_native/csrc/gptq/tuning.h                            +0 −12
colossalai/kernel/cuda_native/csrc/gptq/util.cuh                            +0 −33
colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu                 +0 −191
colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu               +0 −88
colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu                     +0 −169
colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu               +0 −1002
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu               +0 −232
colossalai/kernel/cuda_native/csrc/kernels/include/context.h                +0 −36
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h    +0 −46
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h        +0 −41
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h              +0 −34
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h                +0 −96
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h           +0 −69
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h                +0 −275
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh               +0 −12
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h        +0 −65
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h                +0 −42

colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>
#include "q4_matrix.cuh"
#include "tuning.h"
// Workaround for hipify_python using rocblas instead of hipblas.
#if defined(USE_ROCM)
#include <hipblas/hipblas.h>
#define rocblas_handle hipblasHandle_t
#endif
void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero = false,
    cudaStream_t alt_stream = NULL
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    const cublasHandle_t handle,
    bool no_zero = false
);

#endif
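
The header exposes two entry points for the same product: q4_matmul_cuda runs a fused kernel directly on the packed 4-bit weights, while q4_matmul_recons_cuda first dequantizes the matrix and hands the fp16 GEMM to cuBLAS. In the upstream exllama code the choice is driven by ExLlamaTuning::matmul_recons_thd; the sketch below shows that dispatch under stated assumptions (the wrapper name and exact condition are illustrative, not part of this file):

void q4_matmul_dispatch(ExLlamaTuning* tuningParams, const half* x, int x_height,
                        Q4Matrix* w, half* out, cublasHandle_t handle)
{
    // Tall activations amortize the dequantize cost, so reconstruct + cuBLAS
    // wins there; short ones (e.g. single-token decode) use the fused 4-bit kernel.
    if (tuningParams->matmul_recons_thd && x_height >= tuningParams->matmul_recons_thd)
        q4_matmul_recons_cuda(tuningParams, x, x_height, w, out, handle);
    else
        q4_matmul_cuda(tuningParams, x, x_height, w, out);
}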

colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cu (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "q4_matrix.cuh"
#include <vector>
#include "util.cuh"
#include "matrix.cuh"
using namespace std;

const int UNSHUF_BLOCKSIZE_X = 64;

const int RECONS_THREADS_X = 64;  // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1;   // Block size and thread count along rows in x and out, each thread converts 8 rows

vector<Q4Matrix*> g_q4_matrices;

void g_q4_keep_matrix(Q4Matrix* m)
{
    g_q4_matrices.push_back(m);
}

void g_q4_free_matrices()
{
    for (const auto& m : g_q4_matrices) delete m;
    g_q4_matrices.clear();
}

Q4Matrix::Q4Matrix
(
    const int _height,
    const int _width,
    const int _groups,
    uint32_t* _qweight,
    uint32_t* _qzeros,
    half* _scales,
    uint32_t* _g_idx,
    const int _device
) :
    height(_height),
    width(_width),
    groups(_groups),
    device(_device)
{
    cudaSetDevice(device);

    cuda_qweight = _qweight;
    cuda_qzeros = _qzeros;
    cuda_scales = _scales;

    groupsize = height / groups;

    if (_g_idx) make_sequential(_g_idx);
}

Q4Matrix::~Q4Matrix()
{
}

// Make sequential

__global__ void make_sequential_kernel
(
    const uint32_t* __restrict__ w,
    uint32_t* __restrict__ w_new,
    const uint32_t* __restrict__ x_map,
    const int w_height,
    const int w_width
)
{
    const uint64_t* w2 = (uint64_t*) w;
    uint64_t* w_new2 = (uint64_t*) w_new;
    int w2_stride = w_width >> 1;

    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    if (w2_column >= w2_stride) return;

    int w_new2_row = blockIdx.y;

    int x_map_idx = w_new2_row << 3;

    uint64_t dst = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        int source_row = x_map[x_map_idx++];

        int w2_row = source_row >> 3;
        int w2_subrow = source_row & 0x07;
        int w2_row_shift = w2_subrow << 2;
        int wnew2_row_shift = i << 2;

        uint64_t src = w2[w2_row * w2_stride + w2_column];
        src >>= w2_row_shift;
        src &= 0x0000000f0000000f;
        src <<= wnew2_row_shift;
        dst |= src;
    }

    w_new2[w_new2_row * w2_stride + w2_column] = dst;
}

void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
    uint32_t* cuda_new_qweight = NULL;
    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch

    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));

    // Group histogram

    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;

    // Group map

    for (int i = 0, acc = 0; i < groups; i++)
    {
        short tmp = cpu_g_idx_map[i];
        cpu_g_idx_map[i] = acc;
        acc += tmp;
    }

    // X map (inverse)

    for (int row = 0; row < height; row++)
    {
        uint32_t target_group = cpu_g_idx[row];
        uint32_t target_row = cpu_g_idx_map[target_group];
        cpu_g_idx_map[target_group]++;
        cpu_x_map_inv[row] = target_row;
    }

    // X map

    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;

    // Move to CUDA

    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);

    // Rearrange rows in w

    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
    dim3 blocks
    (
        (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),
        height / 8,
        1
    );

    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);

    // Replace qweights

    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);

    // Cleanup

    cudaDeviceSynchronize();
    cudaFree(cuda_new_qweight);
    free(cpu_g_idx_map);
    free(cpu_x_map);
    free(cpu_x_map_inv);
}

__global__ void reconstruct_kernel
(
    const uint32_t* __restrict__ w,
    half* __restrict__ out,  // (y)
    const half* __restrict__ w_scales,
    const uint32_t* __restrict__ w_zeros,
    const int height,
    const int width,
    const int groupsize
)
{
    // Start of block

    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
    if (column >= width) return;

    // Views

    MatrixView_q4_column w_(w, height, width);
    MatrixView_half_rw out_(out, height, width);
    MatrixView_half w_scales_(w_scales, height / groupsize, width);
    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);

    // Groupsize version

    int group = row / groupsize;

    half w_scale = w_scales_.item(group, column);
    uint32_t w_zero = w_zeros_.item(group, column) + 1;

    uint32_t w_read = w_.item_uint32_t(row, column);
    half* out_ptr = out_.item_ptr(row, column);

    #pragma unroll
    for (int s = 0; s < 32; s += 4)
    {
        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
        *out_ptr = w_item; out_ptr += out_.width;
    }
}

void Q4Matrix::reconstruct(half* out)
{
    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);

    dim3 blocks
    (
        (width + threads.x - 1) / threads.x,
        (height / 8 + threads.y - 1) / threads.y,
        1
    );

    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}
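
make_sequential undoes GPTQ act-order: it builds x_map so that rows sharing a quantization group become contiguous, then repacks qweight accordingly. Each uint32 packs eight 4-bit rows, and the kernel works on uint64 word-pairs (two adjacent columns at once). A host-side transliteration of the per-word gather, for illustration only (hypothetical helper, not part of the deleted file):

#include <cstdint>

// Build one packed destination word-pair from 8 scattered source rows.
uint64_t gather_packed(const uint64_t* w2, const uint32_t* x_map,
                       int dst_row, int w2_stride, int w2_column)
{
    uint64_t dst = 0;
    for (int i = 0; i < 8; i++)
    {
        int source_row = x_map[dst_row * 8 + i];
        uint64_t src = w2[(source_row >> 3) * w2_stride + w2_column];
        src >>= (source_row & 0x07) << 2;   // bring the source nibble down to bit 0
        src &= 0x0000000f0000000fULL;       // keep one nibble from each 32-bit half
        dst |= src << (i << 2);             // deposit at destination slot i
    }
    return dst;
}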

colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,
        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,
        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);
};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif
\ No newline at end of file
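
For orientation, the buffers this class wraps follow the usual GPTQ packing; the shapes below are the upstream convention and should be read as an assumption, since the diff itself does not state them:

// Conventional GPTQ buffer shapes (assumption, per upstream exllama/GPTQ):
//   qweight : uint32 [height / 8][width]  — 8 4-bit weights per word, packed along rows
//   qzeros  : uint32 [groups][width / 8]  — 8 4-bit zero-points per word, packed along columns
//   scales  : half   [groups][width]
//   g_idx   : uint32 [height]             — per-row group index (act-order), consumed by make_sequential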

colossalai/kernel/cuda_native/csrc/gptq/tuning.h (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _tuning_h
#define _tuning_h
struct ExLlamaTuning
{
    int matmul_recons_thd;
    bool matmul_fused_remap;
    bool matmul_no_half2;
};

#endif

colossalai/kernel/cuda_native/csrc/gptq/util.cuh (deleted, 100644 → 0)

// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _util_cuh
#define _util_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase
#endif
// React to failure on return code != cudaSuccess
#define _cuda_check(fn) \
do { \
{_cuda_err = fn;} \
if (_cuda_err != cudaSuccess) goto _cuda_fail; \
} while(false)
// React to failure on return code == 0
#define _alloc_check(fn) \
do { \
if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
else _cuda_err = cudaSuccess; \
} while(false)
#endif
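
Both macros jump to a _cuda_fail label and expect a _cuda_err variable in the enclosing scope. A minimal, hypothetical caller, purely for illustration (every name beyond _cuda_err/_cuda_fail is invented here, not taken from the file):

cudaError_t example_alloc(half** out_buf, size_t n)
{
    cudaError_t _cuda_err;                               // name required by _cuda_check
    _cuda_check(cudaMalloc(out_buf, n * sizeof(half)));  // jumps to _cuda_fail on error
    return cudaSuccess;

_cuda_fail:
    return _cuda_err;
}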

colossalai/kernel/cuda_native/csrc/kernels/cross_entropy.cu (deleted, 100644 → 0)

#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"
ls::cub::CachingDeviceAllocator g_allocator(true);

template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
    const T *__restrict__ inputs, const int *__restrict__ targets,
    float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[2] = {0.f, 0.f};  // logit and logit exp
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    if (threadIdx.x == 0) {
      nll_loss_outputs[blockIdx.x] = 0.f;
      outputs[blockIdx.x] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += logit;
    sum_logits[1] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 2>(sum_logits);
  __shared__ float s_sum_logit;
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_logit = sum_logits[0];
    s_sum_exp = sum_logits[1];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  if (threadIdx.x == 0) {
    // neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
    float nll_loss = logf(s_sum_exp) -
                     static_cast<float>(inputs[block_start + target_tid]) +
                     s_max_input;
    nll_loss_outputs[blockIdx.x] = nll_loss;
    float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
    outputs[blockIdx.x] =
        (1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
  }
}

template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
    const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
    const int *__restrict__ targets, T *__restrict__ grad_inputs,
    const int padding_idx, const float epsilon, const int vocab_size) {
  /* step1: compute each thread's max_logit and sum_exp_logit, store in
   * max_input, sum_exp_logit */
  const int block_start = blockIdx.x * vocab_size;
  const int left_idx = block_start + threadIdx.x;
  const int right_idx = (blockIdx.x + 1) * vocab_size;
  float max_input[1] = {REDUCE_FLOAT_INF_NEG};
  float sum_logits[1] = {0.f};
  const float grad_out = static_cast<float>(grad_outputs[0]);
  int target_tid = targets[blockIdx.x];

  if (target_tid == padding_idx) {
    for (int i = left_idx; i < right_idx; i += blockDim.x) {
      grad_inputs[i] = 0.f;
    }
    return;
  }

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
  }
  blockReduce<ReduceType::kMax, 1>(max_input);
  __shared__ float s_max_input;
  if (threadIdx.x == 0) {
    s_max_input = max_input[0];
  }
  __syncthreads();

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float logit = static_cast<float>(inputs[i]) - s_max_input;
    sum_logits[0] += expf(logit);
  }

  blockReduce<ReduceType::kSum, 1>(sum_logits);
  __shared__ float s_sum_exp;
  if (threadIdx.x == 0) {
    s_sum_exp = sum_logits[0];
  }
  __syncthreads();

  float eps_i = epsilon / (vocab_size - 1);
  float nll_weight = 1.0 - epsilon - eps_i;

  for (int i = left_idx; i < right_idx; i += blockDim.x) {
    float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
    float grad = 0;
    grad += (vocab_size * prob - 1) * eps_i;
    grad += prob * nll_weight;
    if ((i - block_start) == target_tid) {
      grad -= nll_weight;
    }
    grad_inputs[i] = grad_out * grad;
  }
}

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  float *nll_loss_buffer = loss_buffer + grid_dim;
  ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
      epsilon, vocab_size);

  int num_items = grid_dim;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(
      g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage,
                                             temp_storage_bytes, loss_buffer,
                                             outputs_ptr, num_items, stream));
  CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, nll_loss_buffer, nll_loss_ptr,
      num_items, stream));
  CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}

template void launch_cross_entropy_fw<float>(
    const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_fw<__half>(
    const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
    float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream) {
  int grid_dim = batch_size * seq_len;
  ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
      grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
      epsilon, vocab_size);
}

template void launch_cross_entropy_bw<float>(
    const float *grad_outputs_ptr, const float *inputs_ptr,
    const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);

template void launch_cross_entropy_bw<__half>(
    const float *grad_outputs_ptr, const __half *inputs_ptr,
    const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
    const float epsilon, const int batch_size, const int seq_len,
    const int vocab_size, cudaStream_t stream);
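
For reference, the forward kernel computes label-smoothed cross entropy. Writing $V$ for vocab_size, $\epsilon_i = \epsilon/(V-1)$, $\mathrm{nll}_j = \log\sum_k e^{x_k - x_{\max}} - (x_j - x_{\max})$, and $p_j$ for the softmax probabilities, the per-token output reduces to (my derivation from the code above, not text from the diff):

$$\mathcal{L} \;=\; (1-\epsilon-\epsilon_i)\,\mathrm{nll}_t \;+\; \epsilon_i \sum_{j=1}^{V}\mathrm{nll}_j \;=\; (1-\epsilon)\bigl(-\log p_t\bigr) \;+\; \frac{\epsilon}{V-1}\sum_{j\neq t}\bigl(-\log p_j\bigr)$$

where the kernel's sum_nll_loss is exactly $\sum_j \mathrm{nll}_j = V\log\sum_k e^{x_k - x_{\max}} - \sum_j (x_j - x_{\max})$.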

colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu (deleted, 100644 → 0)

/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include "cublas_wrappers.h"
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C, cublasGemmAlgo_t algo) {
  cublasStatus_t status =
      cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
                   (const void *)A, CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k,
                   (const void *)B, CUDA_R_16F, (transb == CUBLAS_OP_N) ? k : n,
                   (const void *)beta, (void *)C, CUDA_R_16F, m, CUDA_R_32F,
                   algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
            "error: %d)\n",
            batch, m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const __half *A, const __half *B, __half *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch, cublasGemmAlgo_t algo) {
  cublasStatus_t status = cublasGemmStridedBatchedEx(
      handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
      (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
      (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m,
      stride_C, batch, CUDA_R_32F, algo);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,
            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
            m, n, k, (int)status);
    return EXIT_FAILURE;
  }
  return 0;
}
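
The leading dimensions follow cuBLAS's column-major convention ((transa == CUBLAS_OP_N) ? m : k, and so on), and both element types accumulate in CUDA_R_32F even when operands are fp16. A minimal hypothetical call for a plain fp32 GEMM C = A × B, with A m×k, B k×n, C m×n in column-major order (the handle and device buffers are assumed to exist; names are illustrative):

float alpha = 1.0f, beta = 0.0f;
// d_A, d_B, d_C: device buffers; handle: a live cublasHandle_t
cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, &beta,
               d_A, d_B, d_C, CUBLAS_GEMM_DEFAULT);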

colossalai/kernel/cuda_native/csrc/kernels/cuda_util.cu (deleted, 100644 → 0)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include "cuda_util.h"
/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
  return cudaGetErrorString(error);
}

std::string _cudaGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }
  return "CUBLAS_UNKNOW";
}

template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line) {
  if (result) {
    throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" +
                             std::to_string(line) +
                             "): " + (_cudaGetErrorString(result)) + "\n");
  }
}

template void check_gpu_error<cudaError_t>(cudaError_t result,
                                           char const *const func,
                                           const char *const file,
                                           int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,
                                              char const *const func,
                                              const char *const file,
                                              int const line);

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<T> hout(num_output_ele, (T)0);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << hout[i] << ", ";
  }
  std::cout << std::endl;
}

template <>
void print_vec<__half>(const __half *outv, std::string outn,
                       int num_output_ele) {
  std::cout << outn << ": ";
  std::vector<__half> hout(num_output_ele, (__half)0.f);
  cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_output_ele; i++) {
    std::cout << __half2float(hout[i]) << ", ";
  }
  std::cout << std::endl;
}

template void print_vec<float>(const float *outv, std::string outn,
                               int num_output_ele);

template void print_vec<int>(const int *outv, std::string outn,
                             int num_output_ele);

template void print_vec<__half>(const __half *outv, std::string outn,
                                int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num) {
  size_t byte_size = ele_num * sizeof(T);
  T *pdata = nullptr;
  CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
  return pdata;
}

template float *cuda_malloc<float>(size_t ele_num);

template __half *cuda_malloc<__half>(size_t ele_num);

template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);

void cuda_free(void *pdata) {
  if (pdata != nullptr) {
    cudaFree(pdata);
  }
}

template <typename T>
struct _isnan {
  __device__ bool operator()(T a) const { return isnan(a); }
};

template <>
struct _isnan<__half> {
  __device__ bool operator()(const __half a) const { return __hisnan(a); }
};

template <typename T>
struct _isinf {
  __device__ bool operator()(T a) const { return isinf(a); }
};

template <>
struct _isinf<__half> {
  __device__ bool operator()(const __half a) const { return __hisinf(a); }
};

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream) {
  // check_nan_inf = 0 for checking nan
  // check_nan_inf = 1 for checking inf
  bool res = false;
  std::string msg = file + "(" + std::to_string(line) + "): ";
  if (check_nan_inf) {
    msg += "nan.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isnan<T>(), false,
                                   thrust::logical_or<bool>());
  } else {
    msg += "inf.";
    res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
                                   data_ptr + dsize, _isinf<T>(), false,
                                   thrust::logical_or<bool>());
  }
  if (res) {
    throw std::runtime_error(msg);
  }
  std::cout << msg << " [check pass]." << std::endl;
}

template void check_nan_inf<float>(const float *data_ptr, int dsize,
                                   bool check_nan_inf, std::string file,
                                   int line, cudaStream_t stream);

template void check_nan_inf<__half>(const __half *data_ptr, int dsize,
                                    bool check_nan_inf, std::string file,
                                    int line, cudaStream_t stream);
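
The CHECK_GPU_ERROR macro used throughout these kernels comes from the companion header cuda_util.h, which this commit also deletes but whose body is not shown in this diff. A minimal sketch of the conventional definition it would need to match check_gpu_error's (result, func, file, line) signature — an assumption, not the verified header text:

// Hedged sketch; the actual macro lives in the (unshown) cuda_util.h.
#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)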

colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu (deleted, 100644 → 0)

#include <chrono>
#include <ctime>
#include "kernels.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

curandStatePhilox4_32_10_t *curandstate;

/**
 * @brief element-wise activation function on device, like Relu, Gelu
 *
 * @tparam enum class ActivationType, kRelu, kGelu
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape and type with input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_kernel(T x);

template <>
__device__ float activation_kernel<ActivationType::kGelu, float>(float x) {
  float cdf =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x))));
  return x * cdf;
}

template <>
__device__ __half2
activation_kernel<ActivationType::kGelu, __half2>(__half2 val) {
  __half2 val_pow3 = __hmul2(val, __hmul2(val, val));
  float2 tmp_pow = __half22float2(val_pow3);
  float2 tmp = __half22float2(val);

  tmp.x =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
  tmp.y =
      0.5f *
      (1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
  return __hmul2(val, __float22half2_rn(tmp));
}

template <>
__device__ float activation_kernel<ActivationType::kRelu, float>(float x) {
  return fmaxf(x, 0);
}

template <>
__device__ __half2
activation_kernel<ActivationType::kRelu, __half2>(__half2 x) {
  return __floats2half2_rn(fmaxf(0.f, __half2float(x.x)),
                           fmaxf(0.f, __half2float(x.y)));
}
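
The constants above come from the tanh approximation of GELU; 0.7978845608028654 is sqrt(2/pi). For reference (my note, not text from the diff):

$$\mathrm{gelu}(x) \;\approx\; 0.5\,x\left(1 + \tanh\!\left(\sqrt{2/\pi}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)$$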
/**
 * @brief element-wise activation backward function on device
 *
 * @tparam enum class ActivationType
 * @tparam input type
 * @param any shape of float and __half2
 * @return same shape of input
 */
template <ActivationType, typename T>
__forceinline__ __device__ T activation_bwd_kernel(T grad, T x);

template <>
__device__ float activation_bwd_kernel<ActivationType::kGelu, float>(
    float grad, float x) {
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * (dg1 + dg2 + dg3);
}

template <>
__device__ __half activation_bwd_kernel<ActivationType::kGelu, __half>(
    __half grad, __half x_half) {
  float x = __half2float(x_half);
  const float sqrt_param = 0.79788456080286535587989211986876f;
  const float mul_param = 0.044715;

  float x2mul = x * x * mul_param;
  float tan_h = tanhf(sqrt_param * (x + x * x2mul));
  float dg1 = 0.5f * (1.0f + tan_h);
  float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
  float dg3 = dg2 * 3 * x2mul;
  return grad * __float2half(dg1 + dg2 + dg3);
}

template <>
__device__ float activation_bwd_kernel<ActivationType::kRelu, float>(
    float grad, float x) {
  return x > 0.f ? grad : 0.f;
}

template <>
__device__ __half
activation_bwd_kernel<ActivationType::kRelu, __half>(__half grad, __half x) {
  const __half half_zero = __float2half(0.f);
  return x > half_zero ? grad : half_zero;
}

template <>
__device__ __half2 activation_bwd_kernel<ActivationType::kRelu, __half2>(
    __half2 grad2, __half2 x_half2) {
  const __half half_zero = __float2half(0.f);
  return __floats2half2_rn(x_half2.x > half_zero ? grad2.x : half_zero,
                           x_half2.y > half_zero ? grad2.y : half_zero);
}

/**
 * @brief init curand states in global memory
 *
 * @thread grid_dim * block_dim to support any size of states
 * @param state persistent curand states
 * @param seed seed to init states
 * @return void
 */
__global__ void curand_init_kernel(curandStatePhilox4_32_10_t *state,
                                   int seed) {
  /* Each thread gets same seed, a different sequence
  number, no offset */
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  curand_init(seed, id, 0, &state[id]);
}

void launch_curand_init(int total_count, int dim, cudaStream_t stream) {
  cudaMalloc(&curandstate, total_count * sizeof(curandStatePhilox4_32_10_t));
  int grid_dim = total_count >> 9;
  curand_init_kernel<<<grid_dim, 512, 0, stream>>>(
      curandstate, std::chrono::duration_cast<std::chrono::microseconds>(
                       std::chrono::system_clock::now().time_since_epoch())
                       .count());
}
/**
 * @brief element-wise dropout, store dropped position in mask, it's not
 * in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out any size of float and __half
 * @param in same with out
 * @param mask uint8 type, same size with out
 * @param seed seed to curand
 * @return void
 */
__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  float *__restrict__ out,
                                  const float *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];

  float4 input4 = data4[i];
  float4 res4;
  res4.x = input4.x * scale * m[0];
  res4.y = input4.y * scale * m[1];
  res4.z = input4.z * scale * m[2];
  res4.w = input4.w * scale * m[3];
  out4[i] = res4;
}

__global__ void ls_dropout_kernel(const int total_count, const float ratio,
                                  __half *__restrict__ out,
                                  const __half *__restrict__ in,
                                  uint8_t *__restrict__ mask, const int seed) {
  const float scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = (uint8_t)(rand.x > ratio);
  m[5] = (uint8_t)(rand.y > ratio);
  m[6] = (uint8_t)(rand.z > ratio);
  m[7] = (uint8_t)(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = *m8;

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
  __half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
  __half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
  __half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  outs_float4[i] = out_float4;
}

/**
 * @brief element-wise dropout backward with dropout mask, it's
 * not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param in any size of float and __half
 * @param mask uint8 type, same size with in
 * @return void
 */
__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      float *out, const float *in,
                                      const uint8_t *__restrict__ mask) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *in4 = reinterpret_cast<const float4 *>(in);
  const uint32_t *mask4 = reinterpret_cast<const uint32_t *>(mask);

  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  m4[0] = mask4[i];

  float4 input4 = in4[i];
  float4 res4;
  res4.x = input4.x * scale * static_cast<float>(m[0]);
  res4.y = input4.y * scale * static_cast<float>(m[1]);
  res4.z = input4.z * scale * static_cast<float>(m[2]);
  res4.w = input4.w * scale * static_cast<float>(m[3]);
  out4[i] = res4;
}

__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
                                      __half *out, const __half *in,
                                      const uint8_t *__restrict__ mask) {
  const __half scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  const uint64_t *mask8 = reinterpret_cast<const uint64_t *>(mask);

  uint8_t m[8];
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  m8[0] = mask8[i];

  float4 val_float4 = vals_float4[i];
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  __half2 scale_mask_1 =
      __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 =
      __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 =
      __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 =
      __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
  out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
  out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
  out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
  out4[i] = out_float4;
}

template <>
void launch_ls_dropout<float>(float *out, const float *vals, uint8_t *mask,
                              int total_count, float ratio,
                              cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 12;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count,
                                                             ratio, out, vals,
                                                             mask);
  }
}

template <>
void launch_ls_dropout<__half>(__half *out, const __half *vals, uint8_t *mask,
                               int total_count, float ratio,
                               cudaStream_t stream, bool backward) {
  int grid_dim = total_count >> 13;
  if (!backward) {
    ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
        total_count, ratio, out, vals, mask,
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count());
  } else {
    ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count,
                                                             ratio, out, vals,
                                                             mask);
  }
}
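
For reference, the forward and backward kernels above implement standard inverted dropout; the formula below summarizes the scaling they apply (my summary, not text from the diff):

$$y_i = \frac{m_i}{1-p}\,x_i,\qquad m_i \sim \mathrm{Bernoulli}(1-p),\qquad \mathbb{E}[y_i] = x_i,\qquad \frac{\partial L}{\partial x_i} = \frac{m_i}{1-p}\,\frac{\partial L}{\partial y_i}$$

The float path vectorizes over float4 (4 elements per thread), the __half path over float4 reinterpreted as four __half2 values (8 elements per thread), which is why the grid shifts differ (>> 12 vs >> 13).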
/**
 * @brief fused bias, dropout, and residual at the end of Attention and FFN,
 * store dropped position in mask, it's not in-place
 *
 * @thread
 * gridDim.x = total_count / 1024
 * blockDim.x = 1024
 *
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out [batch_size, seq_len, hidden_size], float and __half
 * @param in [batch_size, seq_len, hidden_size], float and __half
 * @param mask [batch_size, seq_len, hidden_size], uint8 type
 * @param bias [hidden_size], ffn bias
 * @param residual [batch_size, seq_len, hidden_size], float and __half
 * @param seed seed to curand
 * @param hidden_size hidden size
 * @return void
 */
__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, float *__restrict__ out,
    const float *__restrict__ in, uint8_t *__restrict__ mask,
    const float *__restrict__ bias, const float *__restrict__ residual,
    const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);

  int bias_i = i % (hidden_size >> 2);
  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];
  const float4 input4 = data4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 output4;

  output4.x = (input4.x + b4.x) * scale * m[0] + res4.x;
  output4.y = (input4.y + b4.y) * scale * m[1] + res4.y;
  output4.z = (input4.z + b4.z) * scale * m[2] + res4.z;
  output4.w = (input4.w + b4.w) * scale * m[3] + res4.w;

  out4[i] = output4;
}

__global__ void ls_dropout_res_bias_kernel(
    const int total_count, const float ratio, __half *__restrict__ out,
    const __half *__restrict__ in, uint8_t *__restrict__ mask,
    const __half *__restrict__ bias, const __half *__restrict__ residual,
    const int seed, const int hidden_size) {
  const __half scale = 1. / (1. - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = static_cast<uint8_t>(rand.x > ratio);
  m[1] = static_cast<uint8_t>(rand.y > ratio);
  m[2] = static_cast<uint8_t>(rand.z > ratio);
  m[3] = static_cast<uint8_t>(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = static_cast<uint8_t>(rand.x > ratio);
  m[5] = static_cast<uint8_t>(rand.y > ratio);
  m[6] = static_cast<uint8_t>(rand.z > ratio);
  m[7] = static_cast<uint8_t>(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = m8[0];

  int bias_i = i % (hidden_size >> 3);
  float4 val_float4 = vals_float4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  const float4 res4 = residual4[i];
  float4 out_float4;

  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
  const __half2 *res_half2 = reinterpret_cast<const __half2 *>(&res4);
  __half2 scale_mask_1 =
      __halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
  __half2 scale_mask_2 =
      __halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
  __half2 scale_mask_3 =
      __halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
  __half2 scale_mask_4 =
      __halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
  out_half2[0] =
      __hfma2(__hadd2(val_half2[0], b_half2[0]), scale_mask_1, res_half2[0]);
  out_half2[1] =
      __hfma2(__hadd2(val_half2[1], b_half2[1]), scale_mask_2, res_half2[1]);
  out_half2[2] =
      __hfma2(__hadd2(val_half2[2], b_half2[2]), scale_mask_3, res_half2[2]);
  out_half2[3] =
      __hfma2(__hadd2(val_half2[3], b_half2[3]), scale_mask_4, res_half2[3]);
  outs_float4[i] = out_float4;
}

template <>
void launch_ls_dropout_res_bias<float>(float *out, const float *vals,
                                       uint8_t *mask, const float *bias,
                                       const float *residual, int total_count,
                                       int dim, float ratio,
                                       cudaStream_t stream) {
  int grid_dim = total_count >> 12;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}

template <>
void launch_ls_dropout_res_bias<__half>(__half *out, const __half *vals,
                                        uint8_t *mask, const __half *bias,
                                        const __half *residual,
                                        int total_count, int dim, float ratio,
                                        cudaStream_t stream) {
  int grid_dim = total_count >> 13;
  ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
      total_count, ratio, out, vals, mask, bias, residual,
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count(),
      dim);
}
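
The fusion computes the whole residual branch in one pass over memory; per element (my summary of the code above):

$$y = \frac{m}{1-p}\,(x + b) + r$$

where b is broadcast per hidden unit (hence bias_i = i % (hidden_size >> 2 or 3)) and the __half path evaluates the multiply-add with a single __hfma2 per __half2 pair.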
/**
 * @brief fused bias and dropout backward at the end of Attention and FFN
 *
 * @thread
 * gridDim.x = hidden_size / 8
 * blockDim.x = 8
 * blockDim.y = 1024 / 8 = 128
 *
 * @param row_size batch_size * seq_len
 * @param ratio dropout ratio
 * @param in_grad [batch_size, seq_len, hidden_size], input grad
 * @param bias_grad [hidden_size], bias grad
 * @param out_grad [batch_size, seq_len, hidden_size], output grad
 * @param mask [batch_size, seq_len, hidden_size], dropout mask
 * @param hidden_size
 * @return void
 */
__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, float *__restrict__ in_grad,
    float *__restrict__ bias_grad, const float *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  // every block generate 8 bias result
  __shared__ float tile[8][129];
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  float local_sum = 0;

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    float val = out_grad[idx];
    val *= scale * static_cast<float>(mask[idx]);
    local_sum += val;
    in_grad[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  float sum = 0;
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad[pos] = tile[0][threadIdx.x];
  }
}

__global__ void ls_dropout_bias_bwd_kernel(
    const int row_size, const float ratio, __half *__restrict__ in_grad,
    __half *__restrict__ bias_grad, const __half *__restrict__ out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
  __shared__ __half2 tile[8][129];
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
  __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
  const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
  __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
  int stride = hidden_size * 128;
  __half2 local_sum = __float2half2_rn(0.f);

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  for (int r = threadIdx.y; r < row_size; r += 128) {
    __half2 val = out_grad2[idx];
    __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
    val *= scale * m2;
    local_sum += val;
    in_grad2[idx] = val;
    idx += stride;
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();

  __half2 sum = __float2half2_rn(0.f);
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int x = tid >> 7;
  int y = tid & (127);
  if (y < 32) {
#pragma unroll
    for (int i = 0; i < 4; i++) {
      sum += tile[x][y + i * 32];
    }
  }
  __syncthreads();

  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (y == 0) tile[0][x] = sum;
  __syncthreads();

  if (threadIdx.x < 8) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
    bias_grad2[pos] = tile[0][threadIdx.x];
  }
}

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template <>
void launch_ls_dropout_bias_bwd(__half *in_grad, __half *bias_grad,
                                const __half *out_grad, const uint8_t *mask,
                                int row_size, int dim, float ratio,
                                cudaStream_t stream) {
  dim >>= 1;
  dim3 grid_dim((dim - 1) / 8 + 1);
  dim3 block_dim(8, 128);
  ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}

template void launch_ls_dropout_bias_bwd(float *in_grad, float *bias_grad,
                                         const float *out_grad,
                                         const uint8_t *mask, int row_size,
                                         int dim, float ratio,
                                         cudaStream_t stream);
/**
 * @brief fused bias, activation, and dropout at the end of first ffn
 *
 * @thread
 * gridDim.x = hidden_size / 8
 * blockDim.x = 8
 * blockDim.y = 1024 / 8 = 128
 *
 * @tparam act_type activation function, like kRelu, kGelu
 * @param total_count total elements
 * @param ratio drop ratio
 * @param out [batch_size, seq_len, hidden_size], float and __half
 * @param in [batch_size, seq_len, hidden_size], float and __half
 * @param mask [batch_size, seq_len, hidden_size], uint8 type
 * @param bias [hidden_size], ffn bias
 * @param seed seed to curand
 * @param hidden_size
 * @return void
 */
template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
    const int total_count, const float ratio, float *__restrict__ out,
    const float *__restrict__ in, uint8_t *__restrict__ mask,
    const float *__restrict__ bias, const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 4 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);
  uint8_t m[4];

  float4 *out4 = reinterpret_cast<float4 *>(out);
  const float4 *data4 = reinterpret_cast<const float4 *>(in);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
  float4 rand = curand_uniform4(&state);

  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);

  int bias_i = i % (hidden_size >> 2);
  uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
  mask4[i] = m4[0];
  const float4 input4 = data4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  float4 output4;

  output4.x =
      activation_kernel<act_type, float>(input4.x + b4.x) * scale * m[0];
  output4.y =
      activation_kernel<act_type, float>(input4.y + b4.y) * scale * m[1];
  output4.z =
      activation_kernel<act_type, float>(input4.z + b4.z) * scale * m[2];
  output4.w =
      activation_kernel<act_type, float>(input4.w + b4.w) * scale * m[3];

  out4[i] = output4;
}

template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
    const int total_count, const float ratio, __half *__restrict__ out,
    const __half *__restrict__ in, uint8_t *__restrict__ mask,
    const __half *__restrict__ bias, const int seed, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);

  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i * 8 >= total_count) return;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);

  const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
  float4 *outs_float4 = reinterpret_cast<float4 *>(out);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);

  uint8_t m[8];
  float4 rand = curand_uniform4(&state);
  m[0] = (uint8_t)(rand.x > ratio);
  m[1] = (uint8_t)(rand.y > ratio);
  m[2] = (uint8_t)(rand.z > ratio);
  m[3] = (uint8_t)(rand.w > ratio);
  rand = curand_uniform4(&state);
  m[4] = (uint8_t)(rand.x > ratio);
  m[5] = (uint8_t)(rand.y > ratio);
  m[6] = (uint8_t)(rand.z > ratio);
  m[7] = (uint8_t)(rand.w > ratio);
  uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
  mask8[i] = *m8;

  int bias_i = i % (hidden_size >> 3);
  float4 val_float4 = vals_float4[i];
  const float4 b4 = __ldg(&bias4[bias_i]);
  float4 out_float4;
  __half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
  __half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
  const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
  __half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
  __half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
  __half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
  __half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
  out_half2[0] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[0], b_half2[0])),
                         scale_mask_1);
  out_half2[1] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[1], b_half2[1])),
                         scale_mask_2);
  out_half2[2] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[2], b_half2[2])),
                         scale_mask_3);
  out_half2[3] = __hmul2(activation_kernel<act_type, __half2>(
                             __hadd2(val_half2[3], b_half2[3])),
                         scale_mask_4);
  outs_float4[i] = out_float4;
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, float>(
    float *out, const float *vals, uint8_t *mask, const float *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 10;
  ls_dropout_act_bias_kernel<ActivationType::kGelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, __half>(
    __half *out, const __half *vals, uint8_t *mask, const __half *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 11;
  ls_dropout_act_bias_kernel<ActivationType::kGelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, float>(
    float *out, const float *vals, uint8_t *mask, const float *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 10;
  ls_dropout_act_bias_kernel<ActivationType::kRelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}

template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, __half>(
    __half *out, const __half *vals, uint8_t *mask, const __half *bias,
    int total_count, int dim, float ratio, cudaStream_t stream) {
  int grid_dim = total_count >> 11;
  ls_dropout_act_bias_kernel<ActivationType::kRelu>
      <<<grid_dim + 1, 256, 0, stream>>>(
          total_count, ratio, out, vals, mask, bias,
          std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::system_clock::now().time_since_epoch())
              .count(),
          dim);
}
/**
 * @brief fused bias, activation, and dropout backward
 *
 * @thread
 * gridDim.x = ceil(hidden_size / WARP_SIZE)
 * blockDim.x = WARP_SIZE
 * blockDim.y = WARP_SIZE
 *
 * @tparam act_type kRelu or kGelu
 * @param row_size batch_size * seq_len
 * @param ratio dropout ratio
 * @param in_grad [batch_size, seq_len, hidden_size], input grad
 * @param bias_grad [hidden_size], bias grad
 * @param input [batch_size, seq_len, hidden_size], forward input
 * @param bias [hidden_size], forward bias
 * @param out_grad [batch_size, seq_len, hidden_size], output grad
 * @param mask [batch_size, seq_len, hidden_size], dropout mask
 * @param hidden_size
 * @return void
 */
template <ActivationType act_type, typename T>
__global__ void ls_dropout_act_bias_bwd_kernel(
    const int row_size, const float ratio, T *in_grad,
    T *__restrict__ bias_grad, const T *__restrict__ input,
    const T *__restrict__ bias, const T *out_grad,
    const uint8_t *__restrict__ mask, const int hidden_size) {
  const float scale = 1.f / (1.f - ratio);
  __shared__ float tile[WARP_SIZE][WARP_SIZE + 1];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int stride = hidden_size * WARP_SIZE;
  float local_sum = 0;

  int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
  if (col_idx < hidden_size) {
    for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
      float val = out_grad[idx];
      float in = input[idx];
      float b = bias[idx % hidden_size];
      val = activation_bwd_kernel<act_type, float>(
          val * scale * static_cast<float>(mask[idx]), in + b);
      local_sum += val;
      in_grad[idx] = val;
      idx += stride;
    }
  }

  tile[threadIdx.x][threadIdx.y] = local_sum;
  __syncthreads();
  float sum = tile[threadIdx.y][threadIdx.x];
  __syncthreads();

  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
  __syncthreads();

  if (threadIdx.y == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
    bias_grad[pos] = tile[0][threadIdx.x];
  }
}
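
In the kernel just above, partial sums land in a shared tile padded to WARP_SIZE + 1 columns to dodge shared-memory bank conflicts on the transposed read, and a shfl_down ladder with doubling offsets folds each warp into lane 0. Below is a self-contained demo of just the shuffle ladder; the kernel name and setup are illustrative, not part of this file.

#include <cooperative_groups.h>
#include <cstdio>
namespace cg = cooperative_groups;

// Reduce one warp's values into lane 0 with doubling shfl_down offsets,
// the same pattern used for the bias_grad reduction above.
__global__ void warp_sum_demo(float *out) {
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
  float sum = threadIdx.x;  // lane i contributes i; total = 0+1+...+31 = 496
  for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);
  if (threadIdx.x == 0) *out = sum;
}

int main() {
  float *d_out, h_out;
  cudaMalloc(&d_out, sizeof(float));
  warp_sum_demo<<<1, 32>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %.0f (expect 496)\n", h_out);
  cudaFree(d_out);
  return 0;
}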
// @brief fused bias, activation, and dropout backward
// It is deprecated for precision reasons. Kept here for future optimization.
//
// template <ActivationType act_type>
// __global__ void ls_dropout_act_bias_bwd_kernel(
// const int row_size, const float ratio, __half * in_grad,
// __half *__restrict__ bias_grad, const __half *__restrict__ input, const
// __half *__restrict__ bias, const __half * out_grad, const uint8_t
// *__restrict__ mask, const int hidden_size) {
// const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
// __shared__ __half2 tile[WARP_SIZE][WARP_SIZE + 1];
// cg::thread_block b = cg::this_thread_block();
// cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
// __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
// __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
// const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
// const __half2 *input2 = reinterpret_cast<const __half2 *>(input);
// const __half2 *bias2 = reinterpret_cast<const __half2 *>(bias);
// int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// int stride = hidden_size * WARP_SIZE;
// __half2 local_sum = __float2half2_rn(0.f);
// int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
// if (col_idx < hidden_size) {
// for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
// __half2 val = out_grad2[idx];
// __half2 in2 = input2[idx];
// __half2 b2 = bias2[idx % hidden_size ];
// __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
// val = activation_bwd_kernel<ActivationType::kRelu, __half2>(val * scale
// *
// m2,
// in2+b2);
// local_sum += val;
// in_grad2[idx] = val;
// idx += stride;
// }
// }
// tile[threadIdx.x][threadIdx.y] = local_sum;
// __syncthreads();
// __half2 sum = tile[threadIdx.y][threadIdx.x];
// __syncthreads();
// for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
// if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
// __syncthreads();
// if (threadIdx.y == 0) {
// int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// bias_grad2[pos] = tile[0][threadIdx.x];
// }
// }
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream) {
  dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  ls_dropout_act_bias_bwd_kernel<act_type><<<grid_dim, block_dim, 0, stream>>>(
      row_size, ratio, in_grad, bias_grad, input, bias, out_grad, mask, dim);
}
// template <>
// void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
// __half *in_grad, __half *bias_grad,const __half *input, const __half
// *bias, const __half *out_grad, const uint8_t *mask, int row_size, int
// dim, float ratio, cudaStream_t stream) {
// dim >>= 1;
// dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
// dim3 block_dim(WARP_SIZE, WARP_SIZE);
// ls_dropout_act_bias_bwd_kernel<ActivationType::kRelu>
// <<<grid_dim, block_dim, 0, stream>>>(row_size, ratio, in_grad,
// bias_grad,
// input, bias,out_grad, mask, dim);
// }
template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, float>(
    float *in_grad, float *bias_grad, const float *input, const float *bias,
    const float *out_grad, const uint8_t *mask, int row_size, int dim,
    float ratio, cudaStream_t stream);

template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, __half>(
    __half *in_grad, __half *bias_grad, const __half *input,
    const __half *bias, const __half *out_grad, const uint8_t *mask,
    int row_size, int dim, float ratio, cudaStream_t stream);
colossalai/kernel/cuda_native/csrc/kernels/general_kernels.cu
deleted
100644 → 0
View file @
bce9499e
#include <cooperative_groups.h>
#include "kernels.h"
namespace cg = cooperative_groups;
/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.
@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE
@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
                                  T *__restrict__ out, int rows, int cols) {
  __shared__ float tile[WARP_SIZE][WARP_SIZE];

  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
  int y_stride = cols * WARP_SIZE;
  float localSum = 0;

  // Loop across matrix row
  // TODO: optimize to log complexity
  if (idx < cols) {
    int offset = flat_2dim(threadIdx.y, idx, cols);
    for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
      localSum += (float)inp[offset];
      offset += y_stride;
    }
  }

  // The sum of a row in tile is equal to the sum of a col in original matrix
  tile[threadIdx.x][threadIdx.y] = localSum;
  __syncthreads();

  // Sum the shared buffer.
  // The change of threadIdx.x is continuous
  float sum = tile[threadIdx.y][threadIdx.x];
  __syncthreads();

  // Calculate the sum of a row in tile
  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);

  if (threadIdx.x == 0) {
    int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
    if (pos < cols) out[pos] = sum;
  }
}
// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
                                              int rows, int cols,
                                              cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  column_sum_reduce<float>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}

template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
                                               int rows, int cols,
                                               cudaStream_t stream) {
  dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
  dim3 block_dim(WARP_SIZE, WARP_SIZE);
  column_sum_reduce<__half>
      <<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
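
Each 32x32 block above owns a WARP_SIZE-wide slice of columns: loading rows in strides of WARP_SIZE and transposing through the shared tile lets one warp-shuffle pass finish every column. Below is a hedged host-side sketch of the typical call, computing a bias gradient; it assumes this file and its headers are compiled into the target, and the buffer names are illustrative.

#include <cuda_runtime.h>
#include "cuda_util.h"  // cuda_malloc / cuda_free
#include "kernels.h"    // launch_fuse_transpose_bias_kernel

// Reduce a [rows, cols] activation-gradient matrix over rows into a [cols]
// bias gradient, the typical use of this launcher.
void bias_grad_example(cudaStream_t stream) {
  const int rows = 4096, cols = 1024;  // rows = batch_size * seq_len
  float *d_inp = cuda_malloc<float>((size_t)rows * cols);
  float *d_out = cuda_malloc<float>(cols);
  launch_fuse_transpose_bias_kernel<float>(d_inp, d_out, rows, cols, stream);
  cuda_free(d_inp);
  cuda_free(d_out);
}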
/**
@brief: fused_add2
Add two matrices, inp1 and inp2, writing the result to out.
@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)
@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
                                  int hidden_dim);

template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
                                         const float *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    val.x = vinp1.x + vinp2.x;
    val.y = vinp1.y + vinp2.y;
    val.z = vinp1.z + vinp2.z;
    val.w = vinp1.w + vinp2.w;
    out_4[offset + i] = val;
  }
}

template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
                                          const __half *inp2, int hidden_dim) {
  int row_id = blockIdx.x;
  int offset = flat_2dim(row_id, 0, hidden_dim);

  const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
  const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
  float4 *out_4 = reinterpret_cast<float4 *>(out);
  float4 vinp1;
  float4 vinp2;
  float4 val;
  __half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
  __half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
  __half2 *h2_val = reinterpret_cast<__half2 *>(&val);

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinp1 = inp1_4[offset + i];
    vinp2 = inp2_4[offset + i];
    h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
    h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
    h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
    h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
    out_4[offset + i] = val;
  }
}
//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1,
                              const float *inp2, int batch_size, int seq_len,
                              int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 2;
  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));
  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}

template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
                               const __half *inp2, int batch_size, int seq_len,
                               int hidden_dim, cudaStream_t &stream) {
  hidden_dim >>= 3;
  dim3 grid_dim(batch_size * seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));
  fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
                                                        hidden_dim);
}
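
Both launchers shift hidden_dim before the launch because the kernels move float4 vectors, so hidden_dim must be divisible by 4 for float and by 8 for __half, and the tensors must be 16-byte aligned (which cudaMalloc guarantees). A hedged usage sketch under those assumptions; the names are illustrative.

#include <cuda_runtime.h>
#include "cuda_util.h"  // cuda_malloc
#include "kernels.h"    // launch_fused_add2

// out = inp1 + inp2 over a [batch, seq, hidden] tensor.
void fused_add2_example(cudaStream_t &stream) {
  int batch_size = 8, seq_len = 512, hidden_dim = 1024;  // 1024 % 4 == 0
  size_t n = (size_t)batch_size * seq_len * hidden_dim;
  float *d_a = cuda_malloc<float>(n);
  float *d_b = cuda_malloc<float>(n);
  float *d_out = cuda_malloc<float>(n);
  launch_fused_add2<float>(d_out, d_a, d_b, batch_size, seq_len, hidden_dim,
                           stream);
}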
template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
                                    int sz0, int sz2, int sz1_1, int sz1_2) {
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
  if (idx >= nele) {
    return;
  }
  float4 *dst_ptr = (float4 *)output + idx;
  int idx2 = idx % sz2;
  idx = idx / sz2;
  int idx1 = idx % (sz1_1 + sz1_2);
  int idx0 = idx / (sz1_1 + sz1_2);
  float4 *src_ptr = nullptr;
  int sz1 = 0;
  if (idx1 < sz1_1) {
    sz1 = sz1_1;
    src_ptr = (float4 *)inp1;
  } else {
    idx1 -= sz1_1;
    sz1 = sz1_2;
    src_ptr = (float4 *)inp2;
  }
  src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
  dst_ptr[0] = src_ptr[0];
}
template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
                                float *output, int sz0, int sz2, int sz1_1,
                                int sz1_2, cudaStream_t stream) {
  sz2 >>= 2;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}

template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
                                 __half *output, int sz0, int sz2, int sz1_1,
                                 int sz1_2, cudaStream_t stream) {
  sz2 >>= 3;
  int nele = sz0 * sz2 * (sz1_1 + sz1_2);
  int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
  kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
      inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
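
kernel_concat3_dim1 joins [sz0, sz1_1, sz2] and [sz0, sz1_2, sz2] along dim 1, copying one float4 per thread; the launchers pre-shift sz2, so in elements it must be divisible by 4 (float) or 8 (__half). Below is a scalar host-side reference of the same index mapping, useful for testing; it is illustrative, not part of the file.

#include <vector>

// Scalar reference for concat along dim1: out[i0][i1][i2] comes from inp1
// when i1 < sz1_1, else from inp2 at row i1 - sz1_1. Here sz2 counts
// elements, not float4 vectors.
std::vector<float> concat3_dim1_ref(const std::vector<float> &inp1,
                                    const std::vector<float> &inp2, int sz0,
                                    int sz2, int sz1_1, int sz1_2) {
  std::vector<float> out((size_t)sz0 * (sz1_1 + sz1_2) * sz2);
  for (int i0 = 0; i0 < sz0; ++i0)
    for (int i1 = 0; i1 < sz1_1 + sz1_2; ++i1)
      for (int i2 = 0; i2 < sz2; ++i2) {
        float v = (i1 < sz1_1)
                      ? inp1[((size_t)i0 * sz1_1 + i1) * sz2 + i2]
                      : inp2[((size_t)i0 * sz1_2 + (i1 - sz1_1)) * sz2 + i2];
        out[((size_t)i0 * (sz1_1 + sz1_2) + i1) * sz2 + i2] = v;
      }
  return out;
}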
colossalai/kernel/cuda_native/csrc/kernels/include/context.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <iostream>
#include <string>
#include "cuda_util.h"
class Context {
 public:
  Context() : _stream(nullptr) {
    CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
  }

  virtual ~Context() {}

  static Context &Instance() {
    static Context _ctx;
    return _ctx;
  }

  void set_stream(cudaStream_t stream) {
    _stream = stream;
    CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
  }

  cudaStream_t get_stream() { return _stream; }

  cublasHandle_t get_cublashandle() { return _cublasHandle; }

 private:
  cudaStream_t _stream;
  cublasHandle_t _cublasHandle;
};
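
Context is a lazily constructed singleton that owns one cuBLAS handle and pins it to whatever stream the framework installs, so GEMMs and custom kernels serialize on the same queue. A minimal usage sketch; the stream name is illustrative.

#include <cuda_runtime.h>
#include "context.h"

// Route subsequent cuBLAS calls and kernel launches through one stream.
void bind_stream_example() {
  cudaStream_t stream;
  CHECK_GPU_ERROR(cudaStreamCreate(&stream));
  Context::Instance().set_stream(stream);
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  (void)handle;  // pass to GEMM wrappers alongside the stream-bound kernels
}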
colossalai/kernel/cuda_native/csrc/kernels/include/cross_entropy_layer.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <type_traits>
#include "cuda_util.h"
template <typename T>
class CrossEntropyLayer {
 public:
  CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);

  virtual ~CrossEntropyLayer();

  void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
               float *nll_loss_ptr);

  void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
                const int *targets_ptr, T *grad_inputs_ptr);

  void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);

 private:
  void allocate_mem_buffer() {
    // allocate local gpu memory
    _loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
  }

  void free_mem_buffer() {
    // free local gpu memory
    cuda_free(_loss_buffer);
  }

  const int _padding_idx;
  const float _epsilon;
  const int _max_batch_tokens;

  size_t _batch_size;
  size_t _seq_len;
  size_t _vocab_size;

  float *_loss_buffer;
};
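
The layer is sized once by max_batch_tokens and re-shaped per batch via set_cur_batch_shape before Forward/Backward. Below is a hedged sketch of the calling sequence; the buffer shapes are assumptions read off the signatures, not documented contracts.

#include "cross_entropy_layer.h"

// Hypothetical buffers: d_logits [b, s, V], d_targets [b, s], the loss
// outputs and d_grad_loss are assumed to be small device scalars/buffers.
void criterion_example(const float *d_logits, const int *d_targets,
                       float *d_loss, float *d_nll_loss,
                       const float *d_grad_loss, float *d_grad_logits) {
  CrossEntropyLayer<float> criterion(/*epsilon=*/0.1f, /*padding_idx=*/0,
                                     /*max_batch_tokens=*/8192);
  criterion.set_cur_batch_shape(/*batch_size=*/8, /*seq_len=*/512,
                                /*vocab_size=*/32000);  // 8*512 <= 8192 tokens
  criterion.Forward(d_logits, d_targets, d_loss, d_nll_loss);
  criterion.Backward(d_grad_loss, d_logits, d_targets, d_grad_logits);
}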
colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
deleted
100644 → 0
View file @
bce9499e
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const float *A,
                   const float *B, float *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float *alpha, const float *beta, const __half *A,
                   const __half *B, __half *C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
                                const float *alpha, const float *beta,
                                const float *A, const float *B, float *C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(
    cublasHandle_t handle, int m, int n, int k, const float *alpha,
    const float *beta, const __half *A, const __half *B, __half *C,
    cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
    int stride_C, int batch,
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
colossalai/kernel/cuda_native/csrc/kernels/include/cuda_util.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
                     int const line);

#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);

template <typename T>
T *cuda_malloc(size_t ele_num);

void cuda_free(void *pdata);

template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
                   std::string file, int line, cudaStream_t stream);

#define CHECK_NAN_INF(ptr, size, stream)                            \
  check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \
  check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
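
CHECK_GPU_ERROR stringifies the checked expression with #val and records __FILE__/__LINE__, so every CUDA runtime or cuBLAS call can be asserted in place; CHECK_NAN_INF runs the checker twice, once for NaN and once for Inf. A short sketch, assuming this translation unit links against the definitions:

#include <cuda_runtime.h>
#include "cuda_util.h"

void checked_alloc_example(cudaStream_t stream) {
  float *d_buf = nullptr;
  // On failure, the macro reports the stringified call plus file and line.
  CHECK_GPU_ERROR(cudaMalloc(&d_buf, 1024 * sizeof(float)));
  CHECK_GPU_ERROR(cudaMemsetAsync(d_buf, 0, 1024 * sizeof(float), stream));
  // While debugging, scan a tensor for NaN/Inf on the given stream:
  CHECK_NAN_INF(d_buf, 1024, stream);
  CHECK_GPU_ERROR(cudaFree(d_buf));
}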
colossalai/kernel/cuda_native/csrc/kernels/include/dropout.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <string>
#include "kernels.h"
template <typename T>
class Dropout {
 public:
  struct Config {
    float ratio;
    bool training;

    Config(float r) : ratio(r), training(true) {}
    float RATIO() const { return training ? ratio : 0.0; }
  };

  Dropout(const Config &config, size_t max_ele_num)
      : _config(config), _mask(nullptr) {
    _mask = cuda_malloc<uint8_t>(max_ele_num);
  }

  virtual ~Dropout() { cuda_free(_mask); }

  // after attention softmax
  void dropout(T *output, const T *input, int count, cudaStream_t stream,
               bool bwd = false) {
    launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
                         bwd);
  }

  void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
    launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
                         stream, true);
  }

  // transformer layer's postprocessing dropout, after attn or ffn module,
  // before residual add.
  void bias_dropout_residual(T *output, const T *input, const T *residual,
                             const T *bias, int rows, int cols,
                             cudaStream_t stream) {
    launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
                                  rows * cols, cols, _config.RATIO(), stream);
  }

  void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
                               int rows, int cols, cudaStream_t stream) {
    launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
                                  _config.RATIO(), stream);
  }

  // dropout inside ffn.
  void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
                        int cols, std::string activation_fn,
                        cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
          output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
          stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
                          const T *bias, int rows, int cols,
                          std::string activation_fn, cudaStream_t stream) {
    if (activation_fn == "relu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else if (activation_fn == "gelu") {
      launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
          d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
          _config.RATIO(), stream);
    } else {
      throw std::runtime_error("not supported activation: " + activation_fn);
    }
  }

  bool HasDropout() const { return _config.RATIO() > 0.0; }

  void SetTrainingMode(bool training) { _config.training = training; }

 private:
  uint8_t *_mask;
  Config _config;
};
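
Dropout owns the uint8_t mask for its module, so the same instance must serve forward and backward, and setting training to false makes RATIO() return 0, turning the fused ops into plain bias (+ activation) passes. A hedged sketch of the FFN-internal path; the buffer names are illustrative.

#include "dropout.h"

// FFN-internal dropout: dropout(GeLU(input + bias)) over rows tokens of
// width cols.
void ffn_dropout_example(float *d_out, const float *d_in, const float *d_bias,
                         float *d_grad, float *d_bias_grad, int rows, int cols,
                         cudaStream_t stream) {
  Dropout<float> ffn_dropout(Dropout<float>::Config(0.1f),
                             (size_t)rows * cols);
  ffn_dropout.bias_act_dropout(d_out, d_in, d_bias, rows, cols, "gelu",
                               stream);
  // Backward reuses the mask recorded during the forward pass.
  ffn_dropout.d_bias_act_dropout(d_grad, d_bias_grad, d_in, d_bias, rows,
                                 cols, "gelu", stream);
  // For evaluation, RATIO() becomes 0 and the op degenerates to bias + act.
  ffn_dropout.SetTrainingMode(false);
}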
colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
#include "kernels.h"
template <typename T>
class FeedForward {
 public:
  struct Config {
    int outputSize;
    int inputSize;
    std::array<int, 3> gemm_algos;
    Config(int outputs, int inputs)
        : outputSize(outputs),
          inputSize(inputs),
          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
  };

  FeedForward(Config config) : config_(config) {}

  ~FeedForward() {}

  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
               cublasHandle_t &_cublasHandle) {
    float alpha = T(1.);
    float beta = T(0.);

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
  }

  void Backward(int bsz, const T *out_grad, const T *input_ptr,
                const T *weights, T *weights_grad, T *bias_grad,
                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
                bool compute_bias = true) {
    float alpha = (T)1.0, beta = (T)0.0;
    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));

    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
    if (compute_bias) {
      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
                                           config_.outputSize, stream);
    }
  }

  void reset_size(int outputSize, int inputSize) {
    config_.outputSize = outputSize;
    config_.inputSize = inputSize;
  }

 private:
  Config config_;
};
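
Forward computes out = input * W^T for row-major input [bsz, inputSize] and weights [outputSize, inputSize]: since cuBLAS is column-major, a row-major matrix reads as its own transpose, so (CUBLAS_OP_T, CUBLAS_OP_N) with m = outputSize, n = bsz, k = inputSize yields the product with no explicit transpose. This reading is inferred from the call, not documented here; a hedged sketch follows, assuming the Context singleton from context.h and illustrative buffers.

#include "context.h"
#include "feed_forward.h"

// Row-major view: d_in [bsz, 1024], d_w [4096, 1024], d_out [bsz, 4096].
void linear_example(int bsz, const float *d_in, const float *d_w,
                    float *d_out) {
  FeedForward<float> linear(FeedForward<float>::Config(/*outputs=*/4096,
                                                       /*inputs=*/1024));
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  linear.Forward(bsz, d_in, d_w, d_out, handle);
}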
colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#define MAX_THREADS 1024
#define WARP_SIZE 32
enum class ActivationType { kRelu, kGelu };

void launch_curand_init(int total_count, int dim, cudaStream_t stream);

template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
                       const T *scale, const T *bias, int batch_size,
                       int hidden_dim, cudaStream_t stream);

template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                  const T *residual_grad, const T *inp_or_out, const T *gamma,
                  const T *betta, const T *vars, const T *means, int batch,
                  int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size,
                         int heads, int from_len, int to_len, bool mask_future,
                         cudaStream_t stream);

template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
                            int softmax_len, cudaStream_t stream);

// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
                           int seq_length, int hidden_dim, int nhead,
                           cudaStream_t stream);

// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
                                     int dim_0, int dim_1, int dim_2,
                                     int dim_3, int dim_4,
                                     cudaStream_t stream);

// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
                             int seq_len, int hidden_dim, int nhead,
                             int trans_count, cudaStream_t stream);

template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
                       float ratio, cudaStream_t stream,
                       bool backward = false);

template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, const T *residual,
                                int total_count, int dim, float ratio,
                                cudaStream_t stream);

template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
                                const T *bias, int total_count, int dim,
                                float ratio, cudaStream_t stream);

template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
                                const uint8_t *mask, int row_size, int dim,
                                float ratio, cudaStream_t stream);

template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
                                    const T *bias, const T *out_grad,
                                    const uint8_t *mask, int row_size, int dim,
                                    float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows,
                                       int cols, cudaStream_t stream);

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream);

template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
                         int sz2, int sz1_1, int sz1_2, cudaStream_t stream);

template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
                       int seq_len, int hidden_size, cudaStream_t &stream);

template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
                             float *outputs_ptr, float *nll_loss_ptr,
                             float *loss_buffer, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr,
                             const T *inputs_ptr, const int *targets_ptr,
                             T *grad_inputs_ptr, const int padding_idx,
                             const float epsilon, const int batch_size,
                             const int seq_len, const int vocab_size,
                             cudaStream_t stream);

template <typename T>
void launch_lookup_scale_pos_dropout(
    T *output, const int *input, const T *embeddings, const T *pos_embeddings,
    uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
    int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);

template <typename T>
void launch_d_lookup_scale_pos_dropout(
    T *grad_embeddings, const T *grad_output, const int *input,
    const uint8_t *dropout_mask, int batch_size, int seq_len,
    int embedding_dim, int vocab_size, int padding_idx, float dropout_ratio,
    cudaStream_t &stream);

/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
  return id1 * dim2 + id2;
}

/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
                                                  int dim2, int dim3) {
  return id1 * dim2 * dim3 + id2 * dim3 + id3;
}

/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
                                                  int id4, int dim2, int dim3,
                                                  int dim4) {
  // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
  int res = id4;
  int ld = dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}
/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
                                                  int id4, int id5, int dim2,
                                                  int dim3, int dim4,
                                                  int dim5) {
  // return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5)
  // + id4*dim5 + id5;
  int res = id5;
  int ld = dim5;
  res += id4 * ld;
  ld *= dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}
/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
                                                  int id4, int id5, int id6,
                                                  int dim2, int dim3, int dim4,
                                                  int dim5, int dim6) {
  // return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
  // id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
  int res = id6;
  int ld = dim6;
  res += id5 * ld;
  ld *= dim5;
  res += id4 * ld;
  ld *= dim4;
  res += id3 * ld;
  ld *= dim3;
  res += id2 * ld;
  ld *= dim2;
  res += id1 * ld;
  return res;
}

/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
    int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
    int *id1, int *id2, int *id3, int *id4, int *id5) {
  *id5 = src % dim5;
  src /= dim5;
  *id4 = src % dim4;
  src /= dim4;
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int dim4, int *id0,
                                                        int *id1, int *id2,
                                                        int *id3, int *id4) {
  *id4 = src % dim4;
  src /= dim4;
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
                                                        int dim2, int dim3,
                                                        int *id0, int *id1,
                                                        int *id2, int *id3) {
  *id3 = src % dim3;
  src /= dim3;
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
                                                        int dim2, int *id0,
                                                        int *id1, int *id2) {
  *id2 = src % dim2;
  src /= dim2;
  *id1 = src % dim1;
  *id0 = src / dim1;
}

/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
                                                        int *id0, int *id1) {
  *id1 = src % dim1;
  *id0 = src / dim1;
}
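
The flat_* and decompose_* helpers are exact inverses, which the kernels rely on when turning linear thread indices back into tensor coordinates. A standalone round-trip check, compiled with nvcc against this header (illustrative only):

#include <cassert>
#include <cstdio>
#include "kernels.h"

int main() {
  const int d1 = 4, d2 = 8;              // trailing dims of a [x, 4, 8] tensor
  int src = flat_3dim(2, 3, 5, d1, d2);  // 2*4*8 + 3*8 + 5 = 93
  int i0, i1, i2;
  decompose_3dim(src, d1, d2, &i0, &i1, &i2);
  assert(i0 == 2 && i1 == 3 && i2 == 5);
  printf("flat_3dim(2,3,5) = %d, decomposes back to (%d, %d, %d)\n", src, i0,
         i1, i2);
  return 0;
}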
colossalai/kernel/cuda_native/csrc/kernels/include/ls_cub.cuh
deleted
100644 → 0
View file @
bce9499e
// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
colossalai/kernel/cuda_native/csrc/kernels/include/normalize_layer.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Normalize_Layer {
 public:
  struct Config {
    uint32_t hidden_dim;
    bool use_mean;
    Config(uint32_t hidden_dim, bool use_mean = false)
        : hidden_dim(hidden_dim), use_mean(use_mean) {}
  };

  Normalize_Layer(Config config, size_t max_rows)
      : config_(config), vars_(nullptr), means_(nullptr) {
    vars_ = cuda_malloc<T>(max_rows);
    if (config_.use_mean) {
      means_ = cuda_malloc<T>(max_rows);
    }
  }

  ~Normalize_Layer() {
    cuda_free(vars_);
    cuda_free(means_);
  }

  void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
               int batch_size, cudaStream_t stream) {
    launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
                      config_.hidden_dim, stream);
  }

  /*
  residual_grad, inp_or_out, betta should be treated carefully.
  inp_or_out = input if use_mean else output
  residual_grad, betta can be nullptr.
  residual_grad will be added to dinp if it is not nullptr,
    which is useful in transformer layers with pre-ln.
  betta is only used to compute xhat;
    (use_mean == false) ^ (betta == nullptr) should be true.
  */
  void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                const T *residual_grad, const T *inp_or_out, const T *gamma,
                const T *betta, int batch_size, cudaStream_t stream[2]) {
    launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
                 inp_or_out, gamma, betta, vars_, means_, batch_size,
                 config_.hidden_dim, stream);
  }

  inline bool use_mean() const { return config_.use_mean; }

 private:
  Config config_;
  T *vars_;
  T *means_;
};
colossalai/kernel/cuda_native/csrc/kernels/include/softmax.h
deleted
100644 → 0
View file @
bce9499e
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Softmax {
 public:
  struct Config {
    size_t nhead;
    Config(size_t nhead) : nhead(nhead) {}
  };

  Softmax(Config config) : config_(config) {}

  ~Softmax() {}

  void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
               int to_len, cudaStream_t &stream, bool mask_future = true) {
    launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead,
                           from_len, to_len, mask_future, stream);
  }

  void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
                int to_len, cudaStream_t stream) {
    launch_attn_softmax_bw<T>(out_grad, soft_out,
                              batch_size * config_.nhead * from_len, to_len,
                              stream);
  }

  void reset_size(size_t nhead) { config_.nhead = nhead; }

 private:
  Config config_;
};