Unverified commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__ (see the portability sketch after this list)

* CASE_K

* uint

* remove not

* HALF_FLT_MAX (see the portability sketch after this list)

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check (see the sketch after this list)

* remove gettimeofday (see the timing sketch after this list)

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
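The `__PRETTY_FUNCTION__` and `HALF_FLT_MAX` items concern constructs that GCC/Clang provide but MSVC does not. Below is a minimal sketch of the kind of shims involved; the macro names are illustrative and the actual contents of the commit's `src/turbomind/macro.h` are not reproduced in this excerpt.

```cpp
#pragma once  // sketch of a shim header in the spirit of src/turbomind/macro.h

#include <cuda_fp16.h>

// MSVC does not define __PRETTY_FUNCTION__; __FUNCSIG__ is its counterpart.
// (This macro name is an assumption, not the commit's.)
#if defined(_MSC_VER)
#define PRETTY_FUNCTION __FUNCSIG__
#else
#define PRETTY_FUNCTION __PRETTY_FUNCTION__
#endif

// Largest finite half-precision value: (2 - 2^-10) * 2^15 = 65504.
// Useful where device code cannot rely on a compiler-specific constant.
#ifndef HALF_FLT_MAX
#define HALF_FLT_MAX 65504.f
#endif
```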
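For the "use cudaStreamSynchronize instead of volatile check" item: instead of the host busy-waiting on a host-visible flag, it blocks until all work queued on the stream has completed. A hedged sketch of the pattern follows; the helper name is illustrative, not the commit's code.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Before (pattern being replaced, per the commit title): the host spun on a
// volatile flag written by the device, e.g.
//     while (!*reinterpret_cast<volatile bool*>(done_flag)) {}
// After: block until everything queued on `stream` has completed.
bool wait_for_stream(cudaStream_t stream)
{
    cudaError_t status = cudaStreamSynchronize(stream);
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return false;
    }
    return true;
}

int main()
{
    cudaStream_t stream;
    if (cudaStreamCreate(&stream) != cudaSuccess) {
        return 1;
    }
    // ... launch kernels / async copies on `stream` here ...
    bool ok = wait_for_stream(stream);
    cudaStreamDestroy(stream);
    return ok ? 0 : 1;
}
```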
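For the "remove gettimeofday" item: `<sys/time.h>` does not exist on MSVC, and `std::chrono` covers the same need portably (the int8 GEMM test in the diff below indeed times with `high_resolution_clock`). A minimal, self-contained sketch:

```cpp
#include <chrono>
#include <cstdio>

// Illustrative replacement for a gettimeofday()-based interval timer.
int main()
{
    using std::chrono::duration_cast;
    using std::chrono::high_resolution_clock;
    using std::chrono::microseconds;

    auto start = high_resolution_clock::now();
    // ... the work being timed goes here ...
    auto end = high_resolution_clock::now();

    auto us = duration_cast<microseconds>(end - start).count();
    std::printf("elapsed: %.3f ms\n", us / 1000.0);
    return 0;
}
```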
parent 0d21f366
@@ -24,6 +24,12 @@
 namespace turbomind {
 
+// cub.cuh brings windows.h
+// should be included after cub.cuh
+#ifdef ERROR
+#undef ERROR
+#endif
+
 class Logger {
 public:
...
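For context on the hunk above: windows.h, pulled in transitively through cub.cuh, defines `ERROR` as an object-like macro, which would corrupt any later identifier named `ERROR`. A self-contained illustration of the clash and the `#undef` guard; the `LogLevel` enum here is hypothetical, standing in for the identifier in logger.h.

```cpp
// Simulate what <windows.h> (reached via cub.cuh) does:
#define ERROR 0

// The guard from the hunk above restores the identifier:
#ifdef ERROR
#undef ERROR
#endif

// Hypothetical stand-in for the clashing identifier in logger.h.
enum class LogLevel {
    DEBUG = 0,
    INFO,
    WARNING,
    ERROR  // would expand to `0` without the #undef and fail to parse
};

int main()
{
    return static_cast<int>(LogLevel::ERROR) == 3 ? 0 : 1;
}
```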
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/logger.h"
@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
     }
 
     // get slices
-    ConcateSlice slice0{.slices = {{0, dim0}}};
-    ConcateSlice slice1{.slices = {{0, dim1}}};
+    ConcateSlice slice0{{{0, dim0}}};
+    ConcateSlice slice1{{{0, dim1}}};
     if (slices.size() > 0 && slices[0].slices.size() > 0) {
         slice0 = slices[0];
     }
...
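The `ConcateSlice` change above drops C++20 designated initializers (`.slices = ...`), which GCC/Clang accept as an extension but MSVC rejects in the C++14 mode this project builds with (see `cxx_std_14` in the CMake hunk below), in favor of plain aggregate initialization. A standalone sketch; this `ConcateSlice` definition is illustrative, not the project's:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Illustrative stand-in; the real type lives in turbomind's memory utils.
struct ConcateSlice {
    std::vector<std::pair<std::size_t, std::size_t>> slices;
};

int main()
{
    std::size_t dim0 = 4, dim1 = 8;
    // C++20 designated initializer, rejected by MSVC in C++14 mode:
    // ConcateSlice slice0{.slices = {{0, dim0}}};
    // Plain aggregate initialization expresses the same thing portably:
    ConcateSlice slice0{{{0, dim0}}};
    ConcateSlice slice1{{{0, dim1}}};
    return (slice0.slices.size() == 1 && slice1.slices.size() == 1) ? 0 : 1;
}
```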
@@ -15,6 +15,7 @@
  */
 
 #include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/macro.h"
 #include <atomic>
 
 namespace turbomind {
...
@@ -18,7 +18,7 @@
 #include "nvtx_utils.h"
 
 #ifdef USE_NVTX
-#include "nvToolsExt.h"
+#include "nvtx3/nvToolsExt.h"
 #endif
 
 namespace ft_nvtx {
...
@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
     const T* scales_ptr = get_ptr<const T>(scales);
 
     turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
 
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
     auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
 
     T* output_tensor_ptr = get_ptr<T>(output_tensor);
     char* ws_ptr = get_ptr<char>(ws_tensor);
 
     cudaEvent_t start, stop;
@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
     const T* bias_ptr = get_ptr<const T>(bias);
 
     turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
 
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
     auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
 
     T* output_tensor_ptr = get_ptr<T>(output_tensor);
     char* ws_ptr = get_ptr<char>(ws_tensor);
 
     fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,
...
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
+#include <chrono>
+#include <cstdlib>
 #include <cublas_v2.h>
 #include <iostream>
 #include <vector>
-#include <cstdlib>
-#include <chrono>
 
 #include "torch/csrc/cuda/Stream.h"
 #include <torch/custom_class.h>
@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
 namespace ft = turbomind;
 
 template<typename T>
-void int8_gemm_test(
-    const int m,
-    const int n,
-    const int k,
-    const at::ScalarType output_data_type,
-    const QuantMode quant_mode,
-    const int iters)
+void int8_gemm_test(const int m,
+                    const int n,
+                    const int k,
+                    const at::ScalarType output_data_type,
+                    const QuantMode quant_mode,
+                    const int iters)
 {
-    const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
-                                 || quant_mode == QuantMode::PerTokenQuant;
-    const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
-                                   || quant_mode == QuantMode::PerChannelQuant;
+    const bool per_token_quant =
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
+    const bool per_channel_quant =
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
     const int row_scale_size = per_token_quant ? m : 1;
     const int col_scale_size = per_channel_quant ? n : 1;
@@ -76,16 +75,16 @@ void int8_gemm_test(
     ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
     ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
 
     auto x_gpu = x.to(at_int8).to(torch::kCUDA);
     auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
     auto w_gpu = w.to(at_int8).to(torch::kCUDA);
     auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
     auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
 
-    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
-                             torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
-    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
-                             torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
+    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
+                             * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
+    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
+                             * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
     auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
     auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
@@ -101,40 +100,41 @@ void int8_gemm_test(
     auto stream = at::cuda::getCurrentCUDAStream().stream();
 
     // warm_up
     cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
                              get_ptr<int8_t>(w_T_gpu),
                              quant_mode,
                              get_ptr<float>(alpha_col_gpu),
                              get_ptr<float>(alpha_row_gpu),
                              get_ptr<T>(y_gpu),
                              m,
                              n,
                              k,
                              nullptr,
                              0,
                              stream);
 
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
-    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
+    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
+        "y_gpu_int32.npy");
 
     ft::check_cuda_error(cudaStreamSynchronize(stream));
     auto start = high_resolution_clock::now();
     for (int i = 0; i < iters; ++i) {
         cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
                                  get_ptr<int8_t>(w_T_gpu),
                                  quant_mode,
                                  get_ptr<float>(alpha_col_gpu),
                                  get_ptr<float>(alpha_row_gpu),
                                  get_ptr<T>(y_gpu),
                                  m,
                                  n,
                                  k,
                                  nullptr,
                                  0,
                                  stream);
     }
     ft::check_cuda_error(cudaStreamSynchronize(stream));
@@ -142,27 +142,30 @@ void int8_gemm_test(
     auto duration = duration_cast<microseconds>(end - start);
 
-    if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
+    if (torch::allclose(
+            (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
         TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-    } else {
+    }
+    else {
         TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
+        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
+        // alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
     }
 }
 
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
     if (argc != 7) {
-        TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
+        TM_LOG_ERROR(
+            "arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
         return 0;
     }
 
     const int m = atoi(argv[1]);
     const int n = atoi(argv[2]);
     const int k = atoi(argv[3]);
-    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
-        at::ScalarType::Half : at::ScalarType::BFloat16;
-    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
+    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
+    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
 
     if (quant_mode == QuantMode::PerChannelQuant) {
         printf("per channel quant \n");
     }
@@ -170,7 +173,8 @@ int main(int argc, char **argv)
     if (output_data_type == at::ScalarType::Half) {
         int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
-    } else {
+    }
+    else {
 #if ENABLE_BF16
         int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
 #endif
...
@@ -20,7 +20,12 @@ FetchContent_Declare(
   GIT_REPOSITORY https://github.com/google/googletest.git
   GIT_TAG        release-1.12.1
 )
-add_definitions(-DTORCH_CUDA=1)
+find_package(CUDAToolkit REQUIRED)
+
+if (NOT MSVC)
+    add_definitions(-DTORCH_CUDA=1)
+endif()
 
 # For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
 # Sorted by alphabetical order of test name.
 target_link_libraries( # Libs for test_attention_kernels
     unittest PUBLIC
-    -lcudart -lcurand
+    CUDA::cudart CUDA::curand
     gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
 target_link_libraries( # Libs for test_logprob_kernels
     unittest PUBLIC
-    -lcudart
+    CUDA::cudart
     logprob_kernels memory_utils cuda_utils logger)
 target_link_libraries( # Libs for test_penalty_kernels
     unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
     sampling_penalty_kernels memory_utils cuda_utils logger)
 target_link_libraries( # Libs for test_sampling_kernel
     unittest PUBLIC
-    -lcudart
+    CUDA::cudart
     sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
 target_link_libraries( # Libs for test_sampling_layer
     unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
     cublasMMWrapper memory_utils
     DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
 target_link_libraries( # Libs for test_tensor
@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
 remove_definitions(-DTORCH_CUDA=1)
 add_executable(test_gemm test_gemm.cu)
-target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
+target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
 
 add_executable(test_gpt_kernels test_gpt_kernels.cu)
 target_link_libraries(test_gpt_kernels PUBLIC
@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
 add_executable(test_context_attention_layer test_context_attention_layer.cu)
 target_link_libraries(test_context_attention_layer PUBLIC
-                      Llama -lcublas -lcublasLt -lcudart
+                      Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
                       unfused_attention_kernels
                       memory_utils tensor cublasMMWrapper cuda_utils logger)
...
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
+#include "gtest_utils.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/kernels/unfused_attention_kernels.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"
-#include "gtest_utils.h"
-
 #include <curand.h>
 #include <sstream>
...
@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
     // compute actual
     using AttentionOp = FlashAttentionOp<scalar_t>;
     using Layout = typename AttentionOp::AttentionLayout;
-    Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = seq_len * size_per_head};
-    Layout layout_k{.stride_batch = num_heads * key_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = key_len * size_per_head};
-    Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = key_len * size_per_head};
-    Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
-                    .stride_seq = num_heads * size_per_head,
-                    .stride_head = size_per_head,
-                    .use_seqlens = true};
+    Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
+    Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
+    Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
+    Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
     AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
     float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
-    typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr,
-                                             .query = query_ptr,
-                                             .key = key_ptr,
-                                             .val = val_ptr,
-                                             .mask = mask_ptr,
-                                             .out_accum = accum_buf_ptr,
-                                             .cu_seqlens_q = cu_seqlens_ptr,
-                                             .cu_seqlens_k = nullptr,
-                                             .group_size = 1,
-                                             .layout_q = layout_q,
-                                             .layout_k = layout_k,
-                                             .layout_v = layout_v,
-                                             .layout_o = layout_o};
+    typename AttentionOp::Params attn_params{actual_out_ptr,
+                                             query_ptr,
+                                             key_ptr,
+                                             val_ptr,
+                                             mask_ptr,
+                                             accum_buf_ptr,
+                                             cu_seqlens_ptr,
+                                             nullptr,
+                                             1,
+                                             layout_q,
+                                             layout_k,
+                                             layout_v,
+                                             layout_o};
     flash_attention(attn_params, stream);
     sync_check_cuda_error();
...
 #include <assert.h>
+#include <math.h>
 #include <cublas_v2.h>
-#include <math.h>
 #include <numeric>
 #include <stdexcept>
 #include <tuple>
...@@ -18,35 +18,38 @@ using namespace turbomind; ...@@ -18,35 +18,38 @@ using namespace turbomind;
// Can be replaced by the function provided by a test framework // Can be replaced by the function provided by a test framework
class TestFailureError : public std::exception { class TestFailureError: public std::exception {
private: private:
std::string msg_; std::string msg_;
public: public:
explicit TestFailureError() = default; explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") { explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str()); msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
} }
const char* what () const throw () { const char* what() const throw()
{
return msg_.c_str(); return msg_.c_str();
} }
}; };
#define EXPECT_TRUE(cond) \ #define EXPECT_TRUE(cond) \
do { if(!(cond)) { \ do { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \ if (!(cond)) { \
__func__, __FILE__, __LINE__); \ TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \ throw TestFailureError(__func__); \
} } while(false) } \
} while (false)
#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
do { \ #define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
bool is_ok = checkResult<dtype,ctype>(name, out, ref); \ do { \
if(!is_ok) { \ bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \ if (!is_ok) { \
__func__, __FILE__, __LINE__); \ TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \ throw TestFailureError(__func__); \
} \ } \
} while(false) } while (false)
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
...@@ -58,28 +61,29 @@ private: ...@@ -58,28 +61,29 @@ private:
public: public:
std::vector<size_t> shape; std::vector<size_t> shape;
DataType type; DataType type;
Tensor* tensor; Tensor* tensor;
void* data; void* data;
TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false) TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
{ {
this->allocator = allocator; this->allocator = allocator;
this->type = dtype; this->type = dtype;
this->shape = shape; this->shape = shape;
size_t tensor_memsize = this->memsize(); size_t tensor_memsize = this->memsize();
this->data = this->allocator->malloc(tensor_memsize, false); this->data = this->allocator->malloc(tensor_memsize, false);
if (zero_init) { if (zero_init) {
check_cuda_error(cudaMemset(data, 0x0, tensor_memsize)); check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
} else { }
else {
setRandomValues(); setRandomValues();
} }
this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data); this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
} }
TensorWrapper(TensorWrapper const& other) TensorWrapper(TensorWrapper const& other):
: allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor) allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
{ {
TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data); TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
} }
...@@ -91,13 +95,14 @@ public: ...@@ -91,13 +95,14 @@ public:
void setInvalidValues() void setInvalidValues()
{ {
size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half); size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
size_t tensor_size = type_size * tensor->size(); size_t tensor_size = type_size * tensor->size();
// Fill by a random number to guarantee invalid values // Fill by a random number to guarantee invalid values
check_cuda_error(cudaMemset(data, 0xdc, tensor_size)); check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
} }
void setRandomValues() { void setRandomValues()
{
// random initialization // random initialization
size_t num_elements = this->size(); size_t num_elements = this->size();
switch (this->type) { switch (this->type) {
...@@ -113,7 +118,8 @@ public: ...@@ -113,7 +118,8 @@ public:
} }
} }
size_t size() { size_t size()
{
size_t n_elements = 1; size_t n_elements = 1;
for (size_t s : this->shape) { for (size_t s : this->shape) {
n_elements *= s; n_elements *= s;
...@@ -121,7 +127,8 @@ public: ...@@ -121,7 +127,8 @@ public:
return n_elements; return n_elements;
} }
size_t memsize() { size_t memsize()
{
size_t type_size = 0; size_t type_size = 0;
switch (this->type) { switch (this->type) {
case TYPE_FP32: case TYPE_FP32:
...@@ -138,13 +145,13 @@ public: ...@@ -138,13 +145,13 @@ public:
}; };
template<DataType computeType> template<DataType computeType>
void computeReference(GemmOp transa, void computeReference(GemmOp transa,
GemmOp transb, GemmOp transb,
TensorWrapper& C, TensorWrapper& C,
TensorWrapper& A, TensorWrapper& A,
TensorWrapper& B, TensorWrapper& B,
float alpha = 1.0f, float alpha = 1.0f,
float beta = 0.0f) float beta = 0.0f)
{ {
size_t m = C.shape[0]; size_t m = C.shape[0];
size_t n = C.shape[1]; size_t n = C.shape[1];
...@@ -154,28 +161,36 @@ void computeReference(GemmOp transa, ...@@ -154,28 +161,36 @@ void computeReference(GemmOp transa,
size_t ldb = (transb == GEMM_OP_N) ? n : k; size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n; size_t ldc = n;
cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
half h_alpha = (half)alpha; half h_alpha = (half)alpha;
half h_beta = (half)beta; half h_beta = (half)beta;
const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha; const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta; const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
check_cuda_error(cublasGemmEx(cublas_handle, check_cuda_error(cublasGemmEx(cublas_handle,
getCublasOperation(transb), getCublasOperation(transb),
getCublasOperation(transa), getCublasOperation(transa),
n, m, k, n,
m,
k,
_alpha, _alpha,
(const void*)B.data, btype, ldb, (const void*)B.data,
(const void*)A.data, atype, lda, btype,
ldb,
(const void*)A.data,
atype,
lda,
_beta, _beta,
(void*)C.data, ctype, ldc, (void*)C.data,
ctype,
ldc,
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT)); CUBLAS_GEMM_DEFAULT));
check_cuda_error(cublasDestroy(cublas_handle)); check_cuda_error(cublasDestroy(cublas_handle));
...@@ -199,13 +214,14 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8) ...@@ -199,13 +214,14 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
} }
template<typename T> template<typename T>
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol) { bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
{
assert(out.type == ref.type); assert(out.type == ref.type);
size_t out_size = out.size(); size_t out_size = out.size();
size_t ref_size = ref.size(); size_t ref_size = ref.size();
T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size)); T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size)); T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost); cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost); cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
...@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa ...@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
bool ok = almostEqual(a, b, atol, rtol); bool ok = almostEqual(a, b, atol, rtol);
// Print the error. // Print the error.
if( !ok && failures < 4 ) { if (!ok && failures < 4) {
TM_LOG_ERROR(">> invalid result for i=%lu:", i); TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a); TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b); TM_LOG_ERROR(">> expected...: %10.6f", b);
...@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa ...@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
// Allow not matched up to 1% elements. // Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * out_size); size_t tol_failures = (size_t)(0.01 * out_size);
TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)", TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
name.c_str(), failures <= tol_failures ? "OK" : "FAILED", name.c_str(),
100. * failures / out_size, atol, rtol); failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size,
atol,
rtol);
return failures <= tol_failures; return failures <= tol_failures;
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref) { bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f; {
float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f; float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
bool is_ok = false; float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
bool is_ok = false;
if (sizeof(T) == 4) { if (sizeof(T) == 4) {
is_ok = _checkResult<float>(name, out, ref, atol, rtol); is_ok = _checkResult<float>(name, out, ref, atol, rtol);
} else { }
else {
is_ok = _checkResult<half>(name, out, ref, atol, rtol); is_ok = _checkResult<half>(name, out, ref, atol, rtol);
} }
return is_ok; return is_ok;
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
bool checkResult(TensorWrapper& out, TensorWrapper& ref) { bool checkResult(TensorWrapper& out, TensorWrapper& ref)
{
return checkResult<T, computeType>("", out, ref); return checkResult<T, computeType>("", out, ref);
} }
template<typename T> template<typename T>
std::string toString() { std::string toString()
{
std::string str = "dtype="; std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16"; str += std::is_same<T, float>::value ? "FP32" : "FP16";
return str; return str;
} }
template<typename T, DataType ctype> template<typename T, DataType ctype>
std::string toString() { std::string toString()
{
std::string str = "dtype="; std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16"; str += std::is_same<T, float>::value ? "FP32" : "FP16";
str += ", compute_type="; str += ", compute_type=";
...@@ -273,7 +297,8 @@ std::string toString() { ...@@ -273,7 +297,8 @@ std::string toString() {
return str; return str;
} }
std::string toString(GemmOp op) { std::string toString(GemmOp op)
{
return op == GEMM_OP_N ? "N" : "T"; return op == GEMM_OP_N ? "N" : "T";
} }
...@@ -282,38 +307,38 @@ struct GemmOpPair { ...@@ -282,38 +307,38 @@ struct GemmOpPair {
GemmOp transb; GemmOp transb;
}; };
static const std::vector<GemmOpPair> op_pairs {{GEMM_OP_N, GEMM_OP_N}, static const std::vector<GemmOpPair> op_pairs{
{GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}};
{GEMM_OP_T, GEMM_OP_N},
{GEMM_OP_T, GEMM_OP_T}};
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k)
size_t m, size_t n, size_t k)
{ {
return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]", return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]",
func_name, getGemmOpString(transa).c_str(), getGemmOpString(transb).c_str(), func_name,
m, n, k); getGemmOpString(transa).c_str(),
getGemmOpString(transb).c_str(),
m,
n,
k);
} }
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k)
size_t m, size_t n, size_t k)
{ {
return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k); return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k);
} }
/////////////////////////////////// Unittests ////////////////////////////////////////// /////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) { void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", {
m, n, k, toString<T, computeType>().c_str()); TM_LOG_INFO(
"Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
DataType dtype = getTensorType<T>(); DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
...@@ -322,72 +347,80 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) { ...@@ -322,72 +347,80 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name); TM_LOG_DEBUG(tc_name);
computeReference<computeType>(op_pair.transa, op_pair.transb, computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
expected, a_tensor, b_tensor);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n; size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, a_tensor.type, lda, op_pair.transb,
b_tensor.data, b_tensor.type, ldb, m,
c_tensor.data, c_tensor.type, ldc); n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data); op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
} }
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { void testGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper // Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", TM_LOG_INFO(
m, n, k, toString<T, computeType>().c_str()); "Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
DataType dtype = getTensorType<T>(); DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
TensorWrapper expected(&allocator, dtype, {m, n}, true); TensorWrapper expected(&allocator, dtype, {m, n}, true);
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex(); std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle, cublasMMWrapper cublas_wrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
...@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does. // Switch A/B because Gemm expects column major layout as cublas does.
...@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
size_t ldc = n; size_t ldc = n;
cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), cublas_wrapper.Gemm(getCublasOperation(op_pair.transb),
getCublasOperation(op_pair.transa), getCublasOperation(op_pair.transa),
n, m, k, n,
b_tensor.data, ldb, m,
a_tensor.data, lda, k,
expected.data, ldc); b_tensor.data,
ldb,
c_tensor.setInvalidValues(); // to guarantee C has invalid data a_tensor.data,
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, lda,
a_tensor.data, a_tensor.type, lda, expected.data,
b_tensor.data, b_tensor.type, ldb, ldc);
c_tensor.data, c_tensor.type, ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data); op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
} }
...@@ -442,24 +492,28 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -442,24 +492,28 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper // Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]", TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str()); m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
// batch of in/out tensors // batch of in/out tensors
DataType a_type = getTensorType<T>(); DataType a_type = getTensorType<T>();
DataType b_type = getTensorType<T>(); DataType b_type = getTensorType<T>();
DataType c_type = getTensorType<T>(); DataType c_type = getTensorType<T>();
std::vector<TensorWrapper*> a_tensors; std::vector<TensorWrapper*> a_tensors;
std::vector<TensorWrapper*> b_tensors; std::vector<TensorWrapper*> b_tensors;
std::vector<TensorWrapper*> c_tensors; std::vector<TensorWrapper*> c_tensors;
std::vector<TensorWrapper*> expecteds; std::vector<TensorWrapper*> expecteds;
const size_t batch_size = 3; const size_t batch_size = 3;
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false)); a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false));
b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false)); b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false));
...@@ -484,26 +538,21 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -484,26 +538,21 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
(const T*)expecteds[2]->data}; (const T*)expecteds[2]->data};
T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false)); T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false));
check_cuda_error(cudaMemcpyAsync( check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
(void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream)); const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs); const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4); void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8); void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex(); std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle, cublasMMWrapper cublas_wrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
...@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_type, b_type, c_type, computeType); gemm->setTypes(a_type, b_type, c_type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name); TM_LOG_DEBUG(tc_name);
...@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
n, n,
m, m,
k, k,
(const void* const*)batch_b, ldb, (const void* const*)batch_b,
(const void* const*)batch_a, lda, ldb,
(void* const*)batch_expected, ldc, (const void* const*)batch_a,
lda,
(void* const*)batch_expected,
ldc,
batch_size); batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(op_pair.transa,
batch_a, a_type, lda, op_pair.transb,
batch_b, b_type, ldb, m,
batch_c, c_type, ldc, n,
k,
batch_a,
a_type,
lda,
batch_b,
b_type,
ldb,
batch_c,
c_type,
ldc,
batch_size); batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api1 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues(); c_tensors[i]->setInvalidValues();
} }
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(
batch_a, lda, op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size);
batch_b, ldb,
batch_c, ldc,
batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api2 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues(); c_tensors[i]->setInvalidValues();
} }
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size);
batch_a, batch_b, batch_c, batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api3 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
} }
a_tensors.clear(); a_tensors.clear();
...@@ -574,36 +632,36 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -574,36 +632,36 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k)
{
    // Test if Gemm is consistent with cublasWrapper
    TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
                batch_size,
                m,
                n,
                k,
                toString<T, computeType>().c_str());
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    DataType data_type = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false);
    TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false);
    TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true);
    TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    check_cuda_error(cublasCreate(&cublas_handle));
    check_cuda_error(cublasLtCreate(&cublaslt_handle));
    check_cuda_error(cublasSetStream(cublas_handle, stream));
    cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
    std::mutex* cublas_wrapper_mutex = new std::mutex();
    cublasMMWrapper cublas_wrapper(
        cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
    cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
    cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;

@@ -612,7 +670,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        // Switch A/B because Gemm expects column major layout as cublas does.

@@ -625,7 +683,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

        int64_t stridec = m * n;
        float alpha = 1.0f;
        float beta = 0.0f;
        cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb),
                                          getCublasOperation(op_pair.transa),

@@ -650,35 +708,78 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

                                          getCublasDataType(computeType));
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 a_tensor.type,
                                 lda,
                                 stridea,
                                 b_tensor.data,
                                 b_tensor.type,
                                 ldb,
                                 strideb,
                                 c_tensor.data,
                                 c_tensor.type,
                                 ldc,
                                 stridec,
                                 batch_size,
                                 computeType,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 lda,
                                 stridea,
                                 b_tensor.data,
                                 ldb,
                                 strideb,
                                 c_tensor.data,
                                 ldc,
                                 stridec,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 stridea,
                                 b_tensor.data,
                                 strideb,
                                 c_tensor.data,
                                 stridec,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 b_tensor.data,
                                 c_tensor.data,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
    }
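    // The four calls in the loop above exercise the four stridedBatchedGemm
    // overloads: api1 passes explicit per-operand data types plus ld/stride,
    // api2 drops the types (falling back to setTypes), api3 drops the leading
    // dimensions, and api4 keeps only the data pointers, batch count and
    // alpha/beta. Each result is checked against the reference in `expected`
    // produced by cublas_wrapper.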
@@ -692,15 +793,16 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

// The current SpGemm only supports TYPE_FP16 for T, computeType,
// but let us keep these template variables for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
    TM_LOG_INFO(
        "Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    DataType dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);

@@ -709,47 +811,54 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        // A/B will be switched in SpGemm.
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);
        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
        void* b_compressed;
        compressMatrixB(
            &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
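        // pruneMatrixB zeroes elements of B so it satisfies the structured
        // sparsity pattern the sparse GEMM kernels expect (presumably the 2:4
        // pattern used by NVIDIA sparse tensor cores), and compressMatrixB
        // then packs the surviving values and their metadata into
        // b_compressed, which is what the SpGemm calls below consume.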
        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
        size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
        size_t ldc = n;
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   a_tensor.type,
                   lda,
                   b_compressed,
                   b_tensor.type,
                   ldb,
                   c_tensor.data,
                   c_tensor.type,
                   ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   DenseWeight<T>{(const T*)b_tensor.data, nullptr, (const T*)b_compressed},
                   c_tensor.data);

@@ -761,34 +870,34 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {

}

template<typename T, DataType computeType>
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
    // Test if Gemm is consistent with cublasWrapper
    TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                m,
                n,
                k,
                toString<T, computeType>().c_str());
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    DataType dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
    TensorWrapper expected(&allocator, dtype, {m, n}, true);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    check_cuda_error(cublasCreate(&cublas_handle));
    check_cuda_error(cublasLtCreate(&cublaslt_handle));
    check_cuda_error(cublasSetStream(cublas_handle, stream));
    cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
    std::mutex* cublas_wrapper_mutex = new std::mutex();
    cublasMMWrapper cublas_wrapper(
        cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
    cudaDataType_t cu_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
    cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;

@@ -797,13 +906,12 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);
        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        // Switch A/B because Gemm expects column major layout as cublas does.
        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;

@@ -814,32 +922,40 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

                              n,
                              m,
                              k,
                              b_tensor.data,
                              ldb,
                              a_tensor.data,
                              lda,
                              expected.data,
                              ldc);
        void* b_compressed;
        compressMatrixB(
            &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   a_tensor.type,
                   lda,
                   b_compressed,
                   b_tensor.type,
                   ldb,
                   c_tensor.data,
                   c_tensor.type,
                   ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
    }

@@ -850,18 +966,16 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

}
#endif

int main(int argc, char* argv[])
{
    // testGemmCreate();
    using testcase_t = std::tuple<size_t, size_t, size_t>;
    std::vector<testcase_t> testcases = {
        {16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}};
    // Computation correctness tests
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);

@@ -887,16 +1001,16 @@ int main(int argc, char* argv[]) {

    // Reset for SpGemm test.
    testcases.clear();
    testcases.insert(testcases.end(),
                     {{8, 32, 32},  // minimum possible example.
                      {8, 32, 64},
                      {64, 64, 64},
                      {16, 32, 64},
                      {1024, 32, 1024},
                      {1024, 1024, 32},
                      {16, 1024, 1024},
                      {1024, 1024, 1024}});
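    // Unlike the dense cases above (which deliberately include odd sizes such
    // as 1041 x 999 x 1), every SpGemm shape here is a multiple of 8/16/32;
    // the structured-sparse kernels appear to require such aligned
    // dimensions, as the "{8, 32, 32} minimum possible example" comment
    // suggests.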
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);
...
@@ -5,10 +5,10 @@

#include <string>
#include <vector>
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>

@@ -39,13 +39,14 @@ protected:

    void testTransposition();
};

void fill_tensor_random(Tensor a)
{
    const size_t num_elems = a.size();
    std::vector<int8_t> host_values(num_elems);
    std::uniform_int_distribution<int8_t> int8_random(-128, 127);
    std::mt19937 rng(0);
    std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
    cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
}
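// Portability note: std::uniform_int_distribution is only guaranteed for
// short/int/long (and their unsigned/long long variants); int8_t is not a
// sanctioned IntType, and MSVC's standard library rejects it at compile time.
// A portable variant distributes over int and casts, e.g.:
//     std::uniform_int_distribution<int> dist(-128, 127);
//     ... static_cast<int8_t>(dist(rng)) ...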
@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()

    int8_t *a_data, *a_t_data;
    cudaMalloc(&a_data, m * k * sizeof(int8_t));
    Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
    fill_tensor_random(a);
    cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
    Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
    std::vector<int8_t> a_t_host_ref(a_t.size());
    reference_transpose_host(a_t_host_ref, a);
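    // reference_transpose_host computes the expected row-major (2048, 32)
    // transpose of the random (32, 2048) input on the CPU; the elided lines
    // below presumably launch the device transpose kernel into a_t and
    // compare it against this reference.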
...
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdexcept>
#include <tuple>
#include <vector>
#ifdef __linux__
#include <sys/time.h>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {

    size_t vocab_size;
    size_t beam_width;

    std::string toString()
    {
        return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
                      max_input_length,
                      batch_size,
                      vocab_size,
                      beam_width);
    }
};

/////////////////////////////////// Unittests //////////////////////////////////////////

template<typename T>
class LogProbKernelTest: public FtTestBase {
protected:
    void computeCumLogProbs(float* cum_log_probs,
                            float* log_probs,
                            const T* logits,
                            const int* input_ids,
                            const int* input_lengths,
                            const size_t max_input_length,
                            const size_t batch_size,
                            const size_t vocab_size,

@@ -54,9 +59,9 @@ protected:

                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
                    size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
                    const T* vec = logits + step_offset + i * vocab_size_padded;
                    float max_logits = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {

@@ -67,7 +72,7 @@ protected:

                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
                    int token_id = input_ids[step * batch_size + i];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[step * batch_size + i] = log_prob;
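                    // This is the numerically stable log-softmax: with
                    // m = max_v logits[v],
                    //   log p(token) = logits[token] - m - log(sum_v exp(logits[v] - m)),
                    // which avoids overflow in expf for large logits while
                    // giving the same result as the naive formula.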
@@ -78,11 +83,11 @@ protected:

                }
            }

    void computeCumLogProbsBatchFirst(float* cum_log_probs,
                                      float* log_probs,
                                      const T* logits,
                                      const int* input_ids,
                                      const int* input_lengths,
                                      const size_t max_input_length,
                                      const size_t batch_size,
                                      const size_t vocab_size,

@@ -98,8 +103,8 @@ protected:

                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
                    const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
                    float max_logits = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {

@@ -110,7 +115,7 @@ protected:

                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
                    int token_id = input_ids[i * max_input_length + step];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[i * max_input_length + step] = log_prob;

@@ -122,17 +127,17 @@ protected:

    }

public:
    void runTest(LogProbKernelTestParam param)
    {
        size_t max_input_length = param.max_input_length;
        size_t batchxbeam = param.batch_size * param.beam_width;
        size_t vocab_size = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
        T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
        int* h_input_ids = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers

@@ -145,9 +150,9 @@ public:

        memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
        // device buffers
        T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
        int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
        int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers

@@ -157,7 +162,7 @@ public:

        deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
        void* workspace = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,

@@ -189,16 +194,17 @@ public:

        delete[] h_logits;
    }

    void runBatchFirstTest(LogProbKernelTestParam param)
    {
        size_t max_input_length = param.max_input_length;
        size_t batchxbeam = param.batch_size * param.beam_width;
        size_t vocab_size = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
        T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
        int* h_input_ids = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers

@@ -213,8 +219,8 @@ public:

        // device buffers
        T* d_logits =
            reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
        int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
        int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers

@@ -224,7 +230,7 @@ public:

        check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
        void* workspace = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,

@@ -239,16 +245,16 @@ public:

                                true);
        computeCumLogProbsBatchFirst(expected_cum_log_probs,
                                     nullptr,
                                     h_logits,
                                     h_input_ids,
                                     h_input_lengths,
                                     max_input_length,
                                     batchxbeam,
                                     vocab_size,
                                     vocab_size_padded);
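        // Note the layout difference between the two reference paths: the
        // step-major variant indexes logits as [step, batch, vocab] and
        // input_ids as input_ids[step * batch_size + i], while this
        // batch-first variant indexes logits as [batch, step, vocab] and
        // input_ids as input_ids[i * max_input_length + step], matching the
        // trailing `true` flag passed to invokeLogProbFromLogits above.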
        std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
        bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
        EXPECT_TRUE(passed);
        delete[] expected_cum_log_probs;

@@ -256,10 +262,8 @@ public:

        delete[] h_input_ids;
        delete[] h_logits;
    }
};

TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);

TYPED_TEST(LogProbKernelTest, SingleStep)
...
@@ -14,24 +14,24 @@

 * limitations under the License.
 */

#include <algorithm>  // std::min, std::max
#include <iostream>   // snprintf
#include <math.h>     // expf, log
#include <stdexcept>
#include <stdlib.h>  // rand
#include <string>    // std::string
#include <unordered_map>
#include <vector>  // std::vector
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"

using namespace turbomind;

@@ -41,21 +41,25 @@ struct TemperatureTestParam {

    float* temperatures;
    size_t temperatures_size;

    std::string toString()
    {
        return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
                      batch_size,
                      vocab_size,
                      arr2str(temperatures, temperatures_size).c_str());
    }
};

size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
    return (vocab_size + pad - 1) / pad * pad;
}
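// pad_vocab_size rounds up to the next multiple of `pad` with pure integer
// arithmetic, e.g. pad_vocab_size(50001) = (50001 + 7) / 8 * 8 = 50008, while
// values already on the boundary are unchanged: pad_vocab_size(50008) = 50008.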
template<typename T>
void applyRepetitonPenalty(T* logits,
                           const int* output_ids,
                           const int* input_lengths,
                           const float repetition_penalty,
                           const size_t step,
                           const size_t max_input_length,
                           const size_t batch_size,

@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,

            int token_id = output_ids[i + t * batch_size];
            if (!penalized[token_id]) {
                float logit = static_cast<float>(logits[offset + token_id]);
                logits[offset + token_id] =
                    static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
                penalized[token_id] = true;
            }
        }
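        // Multiplicative repetition penalty: for a penalty > 1 a negative
        // logit is multiplied and a positive one divided, so an already-seen
        // token always becomes less likely regardless of the logit's sign;
        // the `penalized` set ensures each token id is penalized at most once.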
@@ -84,9 +88,8 @@ void applyRepetitonPenalty(T* logits,

    }
}

template<typename T>
void batchApplyRepetitonPenalty(T* logits,
                                const int* output_ids,
                                const int* input_lengths,
                                const float* repetition_penalties,
                                const size_t step,
                                const size_t max_input_length,

@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,

    }
}

template<typename T>
void initLogitsAndBias(
    T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
{
    initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
    if (bias != nullptr) {

@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,

    }
}

/////////////////////////////////// Tests //////////////////////////////////////////

template<typename T>
class TemperaturePenaltyTest: public FtTestBase {
protected:
    // Set up test
    size_t batch_size_;

@@ -157,17 +157,18 @@ protected:

    float* d_temperatures_;

    void subsetup(TemperatureTestParam param)
    {
        batch_size_ = param.batch_size;
        vocab_size_ = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
        h_logits_ = new T[batch_size_ * vocab_size_padded_];
        h_bias_ = new T[vocab_size_padded_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
        if (param.temperatures_size > 1) {

@@ -177,7 +178,7 @@ protected:

        }
    }

    void subteardown()
    {
        delete[] h_logits_;
        delete[] h_bias_;
    }

@@ -195,7 +197,7 @@ protected:

            ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
            for (size_t j = 0; j < vocab_size; ++j) {
                size_t index = i * vocab_size_padded + j;
                float logit = static_cast<float>(logits[index]);
                if (bias != nullptr) {
                    logit += static_cast<float>(bias[j]);
                }

@@ -204,29 +206,18 @@ protected:

        }
    }

public:
    void runTest(TemperatureTestParam param)
    {
        subsetup(param);
        // Do test
        if (param.temperatures_size == 1) {
            invokeApplyTemperaturePenalty(
                d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
        }
        else {
            invokeBatchApplyTemperaturePenalty(
                d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
        }
        computeReference(h_logits_,
                         h_bias_,

@@ -240,21 +231,17 @@ public:

        subteardown();
    }

    void runConsistencyTest(TemperatureTestParam param)
    {
        // Set up test
        ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
        subsetup(param);
        // Run a single runtime value case.
        invokeApplyTemperaturePenalty(
            d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);

        float temperature = param.temperatures[0];
        float* h_temperatures = new float[batch_size_];
        for (size_t i = 0; i < batch_size_; ++i) {
            h_temperatures[i] = temperature;

@@ -263,18 +250,14 @@ public:

        cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
        T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
        invokeBatchApplyTemperaturePenalty(
            d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
        bool passed =
            checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
        EXPECT_TRUE(passed);
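        // Consistency check: the single-temperature kernel and the batched
        // kernel (fed the same temperature replicated batch_size_ times on
        // identical input logits) must produce matching outputs.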
        // Tear down test

@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)

TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 1.0f;

@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)

TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 0.53f;

@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)

TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;

@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)

TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, temperatures, batch_size});
}
@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {

    size_t repetition_penalties_size;
    RepetitionPenaltyType repetition_penalty_type;

    std::string toString()
    {
        static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
            {RepetitionPenaltyType::Additive, "additive"},
            {RepetitionPenaltyType::Multiplicative, "multiplicative"},
            {RepetitionPenaltyType::None, "none"}};
        return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
                      "repetition_penalties=%s, repetition_penalty_type=%s]",
                      batch_size,
                      vocab_size,
                      max_input_length,
                      arr2str(repetition_penalties, repetition_penalties_size).c_str(),
                      typestr_map.at(repetition_penalty_type).c_str());
    }
};

template<typename T>
class RepetitionPenaltyTest: public FtTestBase {
protected:
    // Set up test
    size_t batch_size_;
@@ -392,37 +377,38 @@ protected:

    size_t sequence_length_;
    size_t step_;

    T* h_logits_;
    T* h_bias_;
    int* h_output_ids_;
    int* h_input_lengths_;

    T* d_logits_;
    T* d_bias_;
    int* d_output_ids_;
    int* d_input_lengths_;

    float* d_repetition_penalties_;

    void subsetup(RepetitionPenaltyTestCase param)
    {
        batch_size_ = param.batch_size;
        vocab_size_ = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
        max_input_length_ = param.max_input_length;
        sequence_length_ = 2 * max_input_length_;  // input + output
        step_ = sequence_length_ * 0.7;

        h_logits_ = new T[batch_size_ * vocab_size_padded_];
        h_bias_ = new T[vocab_size_padded_];
        h_output_ids_ = new int[sequence_length_ * batch_size_];
        h_input_lengths_ = new int[batch_size_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
        initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
        d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);

@@ -437,7 +423,8 @@ protected:

        }
    }

    void subteardown()
    {
        delete[] h_logits_;
        delete[] h_bias_;
        delete[] h_output_ids_;

@@ -540,7 +527,8 @@ public:

        subteardown();
    }

    void runConsistencyTest(RepetitionPenaltyTestCase param)
    {
        // Set up test
        ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
        subsetup(param);
@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)

TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 1.0f;

@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)

TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 0.53f;

@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)

TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;

@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)

TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
}

@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)

TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}

@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)

TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
...
@@ -12,6 +12,7 @@

#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
...
#include <algorithm>  // std::fill_n
#include <iostream>   // snprintf
#include <math.h>     // expf, log
#include <stdlib.h>   // rand
#include <string>     // std::string
#include <vector>     // std::vector
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

@@ -14,6 +14,7 @@

#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"

@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)

            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
            int idx = bidx * vocab_size + i;
            float logit = static_cast<float>(logits[idx]) - maxval;
            probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
        }
    }
}

@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)

            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
            int idx = bidx * vocab_size + i;
            float logit = static_cast<float>(logits[idx]) - maxval;
            logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
        }
    }
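// computeProb/computeLogProb are max-shifted (numerically stable) softmax and
// log-softmax references; the EPSILON added to the denominator guards against
// division by (or log of) zero, at the cost of a tiny bias in the result.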
...@@ -119,10 +120,10 @@ public: ...@@ -119,10 +120,10 @@ public:
} }
protected: protected:
unsigned long long seed = 0; unsigned long long seed = 0;
cudaStream_t stream; cudaStream_t stream;
Allocator<AllocatorType::CUDA>* allocator; Allocator<AllocatorType::CUDA>* allocator;
curandState_t* curand_states; curandState_t* curand_states;
}; };
template<typename T> template<typename T>
...@@ -393,8 +394,8 @@ public: ...@@ -393,8 +394,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral) ...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1}); this->runTest({6, 4, 1, 4, 1.0f, 1});
}; };
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63) TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{ {
this->runTest({16, 51200, 1, 63, 1.0f, 8}); this->runTest({16, 51200, 1, 63, 1.0f, 8});
...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP) ...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8}); this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
}; };
template<typename T> template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> { class TopPSamplingKernelTest: public SamplingKernelTest<T> {
...@@ -473,7 +472,7 @@ public: ...@@ -473,7 +472,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
float top_p = param.top_p; float top_p = param.top_p;
...@@ -496,8 +495,8 @@ public: ...@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size)); int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
...@@ -515,17 +514,17 @@ public: ...@@ -515,17 +514,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// Retrieve the workspace size of the top-p sampling kernel. // Retrieve the workspace size of the top-p sampling kernel.
invokeTopPSampling<T>(nullptr, // workspace invokeTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -553,12 +552,7 @@ public: ...@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
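The null-workspace call a few lines above follows the usual CUB convention (note the cub_temp_storage_size out-parameter): a first call with workspace == nullptr only writes back the required byte counts, the buffer is then allocated, and a second call with the real pointer, as below, does the actual sampling. The same two-pass pattern in its plainest CUB form (a standalone sketch with hypothetical buffer names):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Query-then-run: the first call only reports temp_bytes, the second sorts.
    void sortKeysTwoPass(const float* d_keys_in, float* d_keys_out, int n, cudaStream_t stream)
    {
        void*  d_temp     = nullptr;
        size_t temp_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in, d_keys_out, n, 0, 32, stream);
        cudaMalloc(&d_temp, temp_bytes);  // allocate exactly the requested size
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in, d_keys_out, n, 0, 32, stream);
        cudaFree(d_temp);
    }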
invokeTopPSampling<T>(workspace, invokeTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -612,7 +606,7 @@ public: ...@@ -612,7 +606,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
float top_p = param.top_p; float top_p = param.top_p;
float* h_top_ps = new float[batch_size]; float* h_top_ps = new float[batch_size];
// Initialize runtime top-p values. // Initialize runtime top-p values.
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
...@@ -621,7 +615,7 @@ public: ...@@ -621,7 +615,7 @@ public:
float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size); float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
// Logit values on the host of shape (batch_size x vocab_size). // Logit values on the host of shape (batch_size x vocab_size).
T* h_logits = new T[batch_size * vocab_size]; T* h_logits = new T[batch_size * vocab_size];
...@@ -647,8 +641,8 @@ public: ...@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size)); float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
...@@ -668,17 +662,17 @@ public: ...@@ -668,17 +662,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// Retrieve the workspace size of the top-p sampling kernel. // Retrieve the workspace size of the top-p sampling kernel.
invokeBatchTopPSampling<T>(nullptr, // workspace invokeBatchTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -709,12 +703,7 @@ public: ...@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeBatchTopPSampling<T>(workspace, invokeBatchTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -773,8 +762,8 @@ public: ...@@ -773,8 +762,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2) ...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16}); this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
}; };
__global__ __global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) { {
int idx = threadIdx.x; int idx = threadIdx.x;
if (idx < batch_size) { if (idx < batch_size) {
vals[idx] = curand(states + idx); vals[idx] = curand(states + idx);
} }
} }
TEST(SamplingKernelTest, CurandBatchInitialize) { TEST(SamplingKernelTest, CurandBatchInitialize)
size_t batch_size = 127; {
size_t batch_size = 127;
cudaStream_t stream; cudaStream_t stream;
cudaStreamCreate(&stream); cudaStreamCreate(&stream);
curandState_t* curand_states; curandState_t* curand_states;
check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size)); check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
unsigned long long* h_random_seeds = new unsigned long long[batch_size]; unsigned long long* h_random_seeds = new unsigned long long[batch_size];
const size_t period_size = 3; const size_t period_size = 3;
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
h_random_seeds[i] = i / period_size; h_random_seeds[i] = i / period_size;
} }
unsigned long long* d_random_seeds; unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size)); check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds, check_cuda_error(
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice)); cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states. // Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream); invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) { ...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size]; unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size)); check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size); generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync( check_cuda_error(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream)); cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream)); check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number. // The same seed produces the same random number.
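The property this test leans on: curand_init() with an identical (seed, subsequence, offset) triple always reproduces the same state, so two states seeded alike yield the same draw. A self-contained illustration (assumed names, not part of the test file; launch with <<<1, 1>>>):

    #include <curand_kernel.h>

    // Two states initialized identically produce identical sequences.
    __global__ void sameSeedSameDraw(unsigned int* out)
    {
        curandState_t a, b;
        curand_init(/*seed=*/42ULL, /*subsequence=*/0, /*offset=*/0, &a);
        curand_init(/*seed=*/42ULL, /*subsequence=*/0, /*offset=*/0, &b);
        out[0] = curand(&a);
        out[1] = curand(&b);  // always equal to out[0]
    }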
......
#include <algorithm> // std::min, std::max #include <algorithm> // std::min, std::max
#include <iostream> // snprintf #include <iostream> // snprintf
#include <math.h> // expf, log #include <math.h> // expf, log
#include <stdlib.h> // rand #include <stdlib.h> // rand
#include <string> // std::string #include <string> // std::string
#include <vector> // std::vector #include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h> #include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "gtest_utils.h" #include "gtest_utils.h"
...@@ -26,17 +27,24 @@ struct SamplingLayerTestParam { ...@@ -26,17 +27,24 @@ struct SamplingLayerTestParam {
size_t vocab_size; size_t vocab_size;
size_t beam_width; size_t beam_width;
size_t top_k; size_t top_k;
float top_p; float top_p;
size_t output_len; size_t output_len;
std::string toString() { std::string toString()
{
return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]", return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]",
batch_size, vocab_size, beam_width, top_k, top_p, output_len); batch_size,
vocab_size,
beam_width,
top_k,
top_p,
output_len);
} }
}; };
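A portability note that matters for the Windows build this PR enables: %ld pairs with long, which stays 32-bit under MSVC's LLP64 model, while the arguments here are size_t. The standard specifier for size_t is %zu, e.g. (an illustrative sketch, not the project's fmtstr):

    #include <cstdio>

    void printParam(size_t batch_size, size_t vocab_size)
    {
        // %zu matches size_t on both LP64 (Linux) and LLP64 (Windows).
        std::printf("batch=%zu, vocab=%zu\n", batch_size, vocab_size);
    }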
template<typename T> template<typename T>
void computeProb(T* probs, T* logits, int batch_size, int vocab_size) { void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
{
// Compute the probability from logits. // Compute the probability from logits.
// logits = batch_size x vocab_size vector. // logits = batch_size x vocab_size vector.
// probs = softmax(logits) (softmax along the vocab dimension) // probs = softmax(logits) (softmax along the vocab dimension)
...@@ -46,14 +54,15 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) { ...@@ -46,14 +54,15 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
sum += expf((float)logits[bidx * vocab_size + i]); sum += expf((float)logits[bidx * vocab_size + i]);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
probs[idx] = static_cast<T>(expf((float)logits[idx]) / (sum + EPSILON)); probs[idx] = static_cast<T>(expf((float)logits[idx]) / (sum + EPSILON));
} }
} }
} }
template<typename T> template<typename T>
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
{
// Compute the log probability from logits. // Compute the log probability from logits.
// logits = batch_size x vocab_size vector. // logits = batch_size x vocab_size vector.
// logprobs = log(softmax(logits)) (softmax along the vocab dimension) // logprobs = log(softmax(logits)) (softmax along the vocab dimension)
...@@ -63,7 +72,7 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { ...@@ -63,7 +72,7 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
sum += expf(logits[bidx * vocab_size + i]); sum += expf(logits[bidx * vocab_size + i]);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
logprobs[idx] = static_cast<T>(logf(expf(logits[idx]) / (sum + EPSILON) + EPSILON)); logprobs[idx] = static_cast<T>(logf(expf(logits[idx]) / (sum + EPSILON) + EPSILON));
} }
} }
...@@ -72,44 +81,45 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { ...@@ -72,44 +81,45 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
template<typename T> template<typename T>
class SamplingDecodeTest: public testing::Test { class SamplingDecodeTest: public testing::Test {
protected: protected:
unsigned long long seed = 0; unsigned long long seed = 0;
const static unsigned long long max_seed = 30; const static unsigned long long max_seed = 30;
const size_t batch_size = 6; const size_t batch_size = 6;
const size_t beam_width = 1; const size_t beam_width = 1;
const size_t batchxbeam = batch_size * beam_width; const size_t batchxbeam = batch_size * beam_width;
const size_t vocab_size = 8; const size_t vocab_size = 8;
const size_t max_input_len = 0; // has no effect. const size_t max_input_len = 0; // has no effect.
const size_t max_output_len = 3; const size_t max_output_len = 3;
const size_t max_seq_len = max_input_len + max_output_len; const size_t max_seq_len = max_input_len + max_output_len;
const int end_id = vocab_size - 1; const int end_id = vocab_size - 1;
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
// vocab size 8 & length 3 // vocab size 8 & length 3
T* test_input_logits; T* test_input_logits;
cudaStream_t stream; cudaStream_t stream;
ft::Allocator<ft::AllocatorType::CUDA>* allocator; ft::Allocator<ft::AllocatorType::CUDA>* allocator;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
std::mutex *cublas_wrapper_mutex; std::mutex* cublas_wrapper_mutex;
cublasMMWrapper *cublas_wrapper; cublasMMWrapper* cublas_wrapper;
DynamicDecodeLayer<T> *dynamic_decode_layer; DynamicDecodeLayer<T>* dynamic_decode_layer;
int* h_output_ids; int* h_output_ids;
T* h_logits; T* h_logits;
T* h_probs; T* h_probs;
T* h_log_probs; T* h_log_probs;
float* h_cum_log_probs; float* h_cum_log_probs;
float* h_output_log_probs; float* h_output_log_probs;
T* d_logits; T* d_logits;
int* d_input_lengths; int* d_input_lengths;
float* d_cum_log_probs; float* d_cum_log_probs;
float* d_output_log_probs; float* d_output_log_probs;
int* d_output_ids; int* d_output_ids;
int* d_end_ids; int* d_end_ids;
void setup(unsigned long long seed = 0) { void setup(unsigned long long seed = 0)
{
this->seed = seed; this->seed = seed;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
...@@ -124,12 +134,8 @@ protected: ...@@ -124,12 +134,8 @@ protected:
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
cublas_wrapper_mutex = new std::mutex(); cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle, cublas_wrapper = new cublasMMWrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
allocator);
dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
...@@ -140,26 +146,26 @@ protected: ...@@ -140,26 +146,26 @@ protected:
false, // is_free_buffer_after_forward false, // is_free_buffer_after_forward
&prop); // cuda_device_prop &prop); // cuda_device_prop
h_output_ids = new int[batchxbeam]; h_output_ids = new int[batchxbeam];
h_logits = new T[batchxbeam * vocab_size]; h_logits = new T[batchxbeam * vocab_size];
h_probs = new T[batchxbeam * vocab_size]; h_probs = new T[batchxbeam * vocab_size];
h_log_probs = new T[batchxbeam * vocab_size]; h_log_probs = new T[batchxbeam * vocab_size];
h_cum_log_probs = new float[batchxbeam]; h_cum_log_probs = new float[batchxbeam];
h_output_log_probs = new float[max_output_len * batchxbeam]; h_output_log_probs = new float[max_output_len * batchxbeam];
// prob = (0.4, 0.3, 0.2, 0.1, ...) // prob = (0.4, 0.3, 0.2, 0.1, ...)
test_input_logits = new T[24]{ test_input_logits = new T[24]{
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0 -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0
-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1 -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1
-FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2 -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2
}; };
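The constants are simply natural logs of the intended distribution: exp(-0.9163) ≈ 0.4, exp(-1.2040) ≈ 0.3, exp(-1.6094) ≈ 0.2, exp(-2.3026) ≈ 0.1, and the -FLT_MAX entries exponentiate to 0, so each step's softmax reproduces (0.4, 0.3, 0.2, 0.1) over a different 4-token window with no renormalization needed. A quick check (illustrative only):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float logits[4] = {-0.9163f, -1.2040f, -1.6094f, -2.3026f};
        float sum = 0.f;
        for (float x : logits) {
            sum += std::exp(x);
        }
        std::printf("p0=%.3f sum=%.3f\n", std::exp(logits[0]), sum);  // prints p0=0.400 sum=1.000
    }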
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true)); d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam)); d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
d_output_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batchxbeam)); d_output_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam)); d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
// Initialize to zero. // Initialize to zero.
cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam); cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam);
...@@ -168,7 +174,8 @@ protected: ...@@ -168,7 +174,8 @@ protected:
deviceFill(d_end_ids, batchxbeam, end_id, stream); deviceFill(d_end_ids, batchxbeam, end_id, stream);
} }
void teardown() { void teardown()
{
delete[] test_input_logits; delete[] test_input_logits;
delete[] h_output_ids; delete[] h_output_ids;
delete[] h_logits; delete[] h_logits;
...@@ -185,12 +192,8 @@ protected: ...@@ -185,12 +192,8 @@ protected:
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
TensorMap* createInputTensors(int* topk, TensorMap* createInputTensors(
size_t topk_size, int* topk, size_t topk_size, float* topp, size_t topp_size, float* temperature, float* repetition_penalty)
float* topp,
size_t topp_size,
float* temperature,
float* repetition_penalty)
{ {
// construct common input tensors // construct common input tensors
TensorMap* input_tensors = new TensorMap(); TensorMap* input_tensors = new TensorMap();
...@@ -206,16 +209,19 @@ protected: ...@@ -206,16 +209,19 @@ protected:
if (repetition_penalty != nullptr) { if (repetition_penalty != nullptr) {
input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}}); input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}});
} }
input_tensors->insert({"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}}); input_tensors->insert(
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}}); input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}});
input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}}); input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}});
input_tensors->insert({"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}}); input_tensors->insert(
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}}); input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}});
input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}}); input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}});
return input_tensors; return input_tensors;
} }
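Each Tensor inserted above is a non-owning descriptor of (memory space, dtype, shape, pointer): for example, Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty} wraps a single host-resident float without copying it, while the logits entry points at device memory of shape (batch_size, beam_width, vocab_size).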
TensorMap* createOutputTensors() { TensorMap* createOutputTensors()
{
// construct common output tensors // construct common output tensors
TensorMap* output_tensors = new TensorMap(); TensorMap* output_tensors = new TensorMap();
output_tensors->insert( output_tensors->insert(
...@@ -225,26 +231,27 @@ protected: ...@@ -225,26 +231,27 @@ protected:
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, d_cum_log_probs}}); {"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, d_cum_log_probs}});
output_tensors->insert( output_tensors->insert(
{"output_log_probs", {"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}}); Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}});
output_tensors->insert( output_tensors->insert({"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
return output_tensors; return output_tensors;
} }
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n) { void batchH2Dcpy(T* dst, T* src, size_t m, size_t n)
{
for (size_t i = 0; i < m; ++i) { for (size_t i = 0; i < m; ++i) {
cudaH2Dcpy(dst + i * n, src, n); cudaH2Dcpy(dst + i * n, src, n);
} }
} }
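batchH2Dcpy broadcasts one host row into every row of a device matrix, so all batch entries end up sampling from the same distribution. Hypothetical usage with the member names above (the actual call site is elided in this diff):

    // Copy the same vocab_size-long host row into all batchxbeam device rows.
    batchH2Dcpy(d_logits, h_logits, batchxbeam, vocab_size);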
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids) { bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids)
{
assert(expected_ids.size() == max_seq_len * batchxbeam); assert(expected_ids.size() == max_seq_len * batchxbeam);
int* h_output_ids = new int[max_seq_len * batchxbeam]; int* h_output_ids = new int[max_seq_len * batchxbeam];
cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam); cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam);
int failures = 0; int failures = 0;
for (size_t i = 0; i < max_seq_len * batchxbeam; ++i) { for (size_t i = 0; i < max_seq_len * batchxbeam; ++i) {
size_t s = i / batchxbeam; size_t s = i / batchxbeam;
size_t b = i % batchxbeam; size_t b = i % batchxbeam;
std::set<int> expts = expected_ids.at(i); std::set<int> expts = expected_ids.at(i);
if (expts.count(h_output_ids[i]) == 0) { if (expts.count(h_output_ids[i]) == 0) {
if (failures < 10) { if (failures < 10) {
...@@ -260,29 +267,29 @@ protected: ...@@ -260,29 +267,29 @@ protected:
++failures; ++failures;
} }
} }
TM_LOG_DEBUG("check...%6s : failures: %d / %d", TM_LOG_DEBUG(
failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam); "check...%6s : failures: %d / %d", failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
delete[] h_output_ids; delete[] h_output_ids;
return failures == 0; return failures == 0;
} }
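For orientation, the flat index in checkResult is step-major: with batchxbeam = 6, i = 13 decodes to step s = 13 / 6 = 2 and batch entry b = 13 % 6 = 1, matching the (max_seq_len, batch, beam) layout of d_output_ids.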
public: public:
void runTest(std::vector<std::set<int>> expected_output_ids, void runTest(std::vector<std::set<int>> expected_output_ids,
int* top_ks, int* top_ks,
size_t top_k_size, size_t top_k_size,
float* top_ps, float* top_ps,
size_t top_p_size, size_t top_p_size,
float* temperature, float* temperature,
float* repetition_penalty, float* repetition_penalty,
bool use_local_batch = false) bool use_local_batch = false)
{ {
size_t local_batch_size = use_local_batch ? batch_size / 3 : batch_size; size_t local_batch_size = use_local_batch ? batch_size / 3 : batch_size;
uint ite = use_local_batch ? 1 : 0; uint ite = use_local_batch ? 1 : 0;
for (unsigned long long seed = 0; seed < max_seed; ++seed) { for (unsigned long long seed = 0; seed < max_seed; ++seed) {
this->setup(seed); this->setup(seed);
size_t step = max_input_len; size_t step = max_input_len;
TensorMap* input_tensors = createInputTensors( TensorMap* input_tensors =
top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty); createInputTensors(top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}); input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}});
input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}); input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}});
input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}); input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}});
...@@ -316,27 +323,57 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes); ...@@ -316,27 +323,57 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes);
TYPED_TEST(SamplingDecodeTest, TopK) TYPED_TEST(SamplingDecodeTest, TopK)
{ {
int top_k = 2; int top_k = 2;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
// 0 1 2 3 4 5 // 0 1 2 3 4 5
{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3} // step 2 {0, 1},
{0, 1},
{0, 1},
{0, 1}, // step 0
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5}, // step 1
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopK) TYPED_TEST(SamplingDecodeTest, BatchTopK)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1}; int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
// 0 1 2 3 4 5 // 0 1 2 3 4 5
{0, 1}, {0}, {0}, {0, 1}, {0}, {0}, // step 0 {0, 1},
{4, 5}, {4}, {4}, {4, 5}, {4}, {4}, // step 1 {0},
{2, 3}, {2}, {2}, {2, 3}, {2}, {2} // step 2 {0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2},
{2},
{2, 3},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -344,52 +381,112 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK) ...@@ -344,52 +381,112 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK)
TYPED_TEST(SamplingDecodeTest, TopP) TYPED_TEST(SamplingDecodeTest, TopP)
{ {
float top_p = 0.3; float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopP) TYPED_TEST(SamplingDecodeTest, BatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f}; float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
{0}, {0, 1}, {0, 1}, {0}, {0, 1}, {0, 1}, // step 0 {0},
{4}, {4, 5}, {4, 5}, {4}, {4, 5}, {4, 5}, // step 1 {0, 1},
{2}, {2, 3}, {2, 3}, {2}, {2, 3}, {2, 3} // step 2 {0, 1},
{0},
{0, 1},
{0, 1}, // step 0
{4},
{4, 5},
{4, 5},
{4},
{4, 5},
{4, 5}, // step 1
{2},
{2, 3},
{2, 3},
{2},
{2, 3},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, TopKTopP) { TYPED_TEST(SamplingDecodeTest, TopKTopP)
int top_k = 2; {
float top_p = 0.3; int top_k = 2;
std::vector<std::set<int>> expected_output_ids { float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopKTopP) TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1}; int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1};
float top_p = 0.3; float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -397,29 +494,59 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKTopP) ...@@ -397,29 +494,59 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
TYPED_TEST(SamplingDecodeTest, TopKBatchTopP) TYPED_TEST(SamplingDecodeTest, TopKBatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 2; int top_k = 2;
float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5}; float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2 {0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP) TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0}; int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2 {0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -428,162 +555,351 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP) ...@@ -428,162 +555,351 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 0; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
float top_p = 0; float top_p = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 0; int top_k = 0;
float top_p = 0; float top_p = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP) { TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0}; size_t batch_size = this->batch_size;
float top_p = 0; int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0};
std::vector<std::set<int>> expected_output_ids { float top_p = 0;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP) { TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP)
size_t batch_size = this->batch_size; {
int top_k = 0; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0}, {0}, {0, 1}, {0}, // step 0 {0, 1},
{4, 5}, {4}, {4}, {4}, {4, 5}, {4}, // step 1 {0},
{2, 3}, {2}, {2}, {2}, {2, 3}, {2} // step 2 {0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2},
{2},
{2},
{2, 3},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero)
size_t batch_size = this->batch_size; {
float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0, 1}, {0}, {0, 1}, {0}, {0}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4}, {4, 5}, {4}, {4}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2}, {2, 3}, {2}, {2} // step 2 {0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2, 3},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0}; size_t batch_size = this->batch_size;
float top_p = 0.0; int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0};
std::vector<std::set<int>> expected_output_ids { float top_p = 0.0;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0, 1}, {0}, {0}, {0, 1}, {0}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4}, {4}, {4, 5}, {4}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2}, {2}, {2, 3}, {2} // step 2 {0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2},
{2, 3},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero)
size_t batch_size = this->batch_size; {
int top_k = 0; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0, 1}, {0}, {0}, {0, 1}, // step 0 {0},
{4}, {4}, {4, 5}, {4}, {4}, {4, 5}, // step 1 {0},
{2}, {2}, {2, 3}, {2}, {2}, {2, 3} // step 2 {0, 1},
{0},
{0},
{0, 1}, // step 0
{4},
{4},
{4, 5},
{4},
{4},
{4, 5}, // step 1
{2},
{2},
{2, 3},
{2},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0}; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5}; int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0};
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0, 1}, {0}, {0, 1}, // step 0 {0},
{4}, {4}, {4}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2}, {2}, {2}, {2, 3}, {2}, {2, 3} // step 2 {0},
{0, 1},
{0},
{0, 1}, // step 0
{4},
{4},
{4},
{4, 5},
{4},
{4, 5}, // step 1
{2},
{2},
{2},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP) { TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP)
size_t batch_size = this->batch_size; {
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
{0}, {0}, {0, 1}, {0}, {0}, {0}, // step 0 std::vector<std::set<int>> expected_output_ids{
{0}, {0}, {4, 5}, {4}, {0}, {0}, // step 1 {0},
{0}, {0}, {2, 3}, {2}, {0}, {0} // step 2 {0},
{0, 1},
{0},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2},
{0},
{0} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ps; delete[] top_ps;
} }
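Why rows 0, 1, 4 and 5 stay {0} in the two LocalBatch tests: runTest passes local_batch_size = batch_size / 3 = 2 together with ite = 1, so the decode layer only works on the second chunk of rows, and the untouched rows keep their zero-initialized output ids. Assuming the usual chunk arithmetic (a sketch, not the layer's code):

    #include <cstdio>

    int main()
    {
        const size_t   batch_size       = 6;
        const size_t   local_batch_size = batch_size / 3;           // 2
        const unsigned ite              = 1;                        // second chunk
        const size_t   first            = ite * local_batch_size;   // 2
        const size_t   last             = first + local_batch_size; // 4
        std::printf("decoded rows: [%zu, %zu)\n", first, last);     // rows 2 and 3
    }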
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP) { TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0}; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0, 1}, {0, 1}, {0}, {0}, // step 0 {0},
{0}, {0}, {4, 5}, {4, 5}, {0}, {0}, // step 1 {0},
{0}, {0}, {2, 3}, {2, 3}, {0}, {0} // step 2 {0, 1},
{0, 1},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4, 5},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2, 3},
{0},
{0} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ks; delete[] top_ks;
...@@ -601,15 +917,10 @@ public: ...@@ -601,15 +917,10 @@ public:
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublas_algo_map = new cublasAlgoMap(""); cublas_algo_map = new cublasAlgoMap("");
cublas_wrapper_mutex = new std::mutex(); cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle, cublas_wrapper = new cublasMMWrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, allocator);
stream,
cublas_algo_map,
cublas_wrapper_mutex,
allocator);
} }
void TearDown() override void TearDown() override
{ {
...@@ -626,12 +937,11 @@ protected: ...@@ -626,12 +937,11 @@ protected:
using FtTestBase::allocator; using FtTestBase::allocator;
struct cudaDeviceProp prop; struct cudaDeviceProp prop;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
cublasAlgoMap* cublas_algo_map; cublasAlgoMap* cublas_algo_map;
std::mutex* cublas_wrapper_mutex; std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper; cublasMMWrapper* cublas_wrapper;
DataType data_type = getTensorType<T>(); DataType data_type = getTensorType<T>();
...@@ -643,50 +953,50 @@ protected: ...@@ -643,50 +953,50 @@ protected:
size_t max_output_len; size_t max_output_len;
size_t max_seq_len; size_t max_seq_len;
uint top_k; uint top_k;
float top_p; float top_p;
float temperature; float temperature;
float repetition_penalty; float repetition_penalty;
int end_id; int end_id;
T* h_logits; T* h_logits;
T* h_probs; T* h_probs;
T* h_log_probs; T* h_log_probs;
float* h_cum_log_probs; float* h_cum_log_probs;
float* h_output_log_probs; float* h_output_log_probs;
int* h_output_ids; int* h_output_ids;
T* d_logits; T* d_logits;
int* d_input_lengths; int* d_input_lengths;
float* d_cum_log_probs; float* d_cum_log_probs;
float* d_output_log_probs; float* d_output_log_probs;
int* d_output_ids; int* d_output_ids;
int* d_end_ids; int* d_end_ids;
void setup(SamplingLayerTestParam param) void setup(SamplingLayerTestParam param)
{ {
batch_size = param.batch_size; batch_size = param.batch_size;
beam_width = param.beam_width; beam_width = param.beam_width;
batchxbeam = batch_size * param.beam_width; batchxbeam = batch_size * param.beam_width;
vocab_size = param.vocab_size; vocab_size = param.vocab_size;
max_input_len = 0; max_input_len = 0;
max_output_len = param.output_len; max_output_len = param.output_len;
max_seq_len = max_input_len + max_output_len; max_seq_len = max_input_len + max_output_len;
top_k = param.top_k; top_k = param.top_k;
top_p = param.top_p; top_p = param.top_p;
// Use default values that have no effect. // Use default values that have no effect.
temperature = 1.0f; temperature = 1.0f;
repetition_penalty = 1.0f; repetition_penalty = 1.0f;
end_id = 0; end_id = 0;
h_logits = new T[batchxbeam * vocab_size]; h_logits = new T[batchxbeam * vocab_size];
h_output_ids = new int[batchxbeam]; h_output_ids = new int[batchxbeam];
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size)); d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam)); d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size)); d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
// Initialize to zero. // Initialize to zero.
deviceFill(d_input_lengths, batchxbeam, 0, stream); deviceFill(d_input_lengths, batchxbeam, 0, stream);
...@@ -694,14 +1004,13 @@ protected: ...@@ -694,14 +1004,13 @@ protected:
deviceFill(d_end_ids, batch_size, end_id); deviceFill(d_end_ids, batch_size, end_id);
} }
void teardown() { void teardown()
{
delete[] h_logits; delete[] h_logits;
delete[] h_output_ids; delete[] h_output_ids;
} }
void runCurandTest(SamplingLayerTestParam param, void runCurandTest(SamplingLayerTestParam param, bool use_local_batch, bool use_single_random_seed)
bool use_local_batch,
bool use_single_random_seed)
{ {
setup(param); setup(param);
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
...@@ -709,7 +1018,7 @@ protected: ...@@ -709,7 +1018,7 @@ protected:
const size_t local_batch_size = use_local_batch ? 3 : batch_size; const size_t local_batch_size = use_local_batch ? 3 : batch_size;
assert(batch_size % local_batch_size == 0); assert(batch_size % local_batch_size == 0);
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
end_id, end_id,
stream, stream,
...@@ -719,9 +1028,9 @@ protected: ...@@ -719,9 +1028,9 @@ protected:
&prop); // cuda_device_prop &prop); // cuda_device_prop
// Prepare decoding arguments // Prepare decoding arguments
const size_t random_seed_size = use_single_random_seed ? 1 : batch_size; const size_t random_seed_size = use_single_random_seed ? 1 : batch_size;
const size_t period_size = 3; const size_t period_size = 3;
unsigned long long* random_seed = new unsigned long long[random_seed_size]; unsigned long long* random_seed = new unsigned long long[random_seed_size];
for (size_t i = 0; i < random_seed_size; ++i) { for (size_t i = 0; i < random_seed_size; ++i) {
random_seed[i] = i / period_size; random_seed[i] = i / period_size;
} }
...@@ -739,29 +1048,27 @@ protected: ...@@ -739,29 +1048,27 @@ protected:
cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size); cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size);
for (uint ite = 0; ite < iteration_num; ++ite) { for (uint ite = 0; ite < iteration_num; ++ite) {
TensorMap dynamic_decode_input_tensors({ TensorMap dynamic_decode_input_tensors(
{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}}, {{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}}, {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}, {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}}, {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}}, {"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}}, {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}} {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
});
// common outputs // common outputs
TensorMap dynamic_decode_output_tensors({ TensorMap dynamic_decode_output_tensors(
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}}, {{"output_ids",
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}}, Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}} {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
}); {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
&dynamic_decode_input_tensors);
sync_check_cuda_error(); sync_check_cuda_error();
// check results. // check results.
...@@ -774,7 +1081,11 @@ protected: ...@@ -774,7 +1081,11 @@ protected:
for (size_t j = 1; j < period_size; ++j) { for (size_t j = 1; j < period_size; ++j) {
EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j]) EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j])
<< fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d", << fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d",
step, i, h_output_ids[i], i + j, h_output_ids[i + j]); step,
i,
h_output_ids[i],
i + j,
h_output_ids[i + j]);
} }
} }
} }
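With period_size = 3, the seed array prepared earlier in runCurandTest is {0, 0, 0, 1, 1, 1, ...}; rows that share a seed get identical curand states, so their sampled ids must agree at every step, which is exactly what the nested loops assert. For a batch of 6 (illustrative):

    unsigned long long random_seed[6] = {0, 0, 0, 1, 1, 1};  // random_seed[i] = i / period_size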
...@@ -783,11 +1094,12 @@ protected: ...@@ -783,11 +1094,12 @@ protected:
teardown(); teardown();
} }
void runCumLogProbTest(SamplingLayerTestParam param) { void runCumLogProbTest(SamplingLayerTestParam param)
{
setup(param); setup(param);
unsigned long long seed = 43; unsigned long long seed = 43;
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
end_id, end_id,
stream, stream,
...@@ -798,10 +1110,10 @@ protected: ...@@ -798,10 +1110,10 @@ protected:
// Logit values on the host of shape ((batch_size x beam) x vocab_size) where beam = 1. // Logit values on the host of shape ((batch_size x beam) x vocab_size) where beam = 1.
// T* h_logits = new T[batch_size * beam_width * vocab_size]; // T* h_logits = new T[batch_size * beam_width * vocab_size];
T* h_probs = new T[batch_size * beam_width * vocab_size]; T* h_probs = new T[batch_size * beam_width * vocab_size];
T* h_log_probs = new T[batch_size * beam_width * vocab_size]; T* h_log_probs = new T[batch_size * beam_width * vocab_size];
float* h_cum_log_probs = new float[batch_size * beam_width]; float* h_cum_log_probs = new float[batch_size * beam_width];
float* h_output_log_probs = new float[max_output_len * batch_size * beam_width]; float* h_output_log_probs = new float[max_output_len * batch_size * beam_width];
float* expected_cum_log_probs = new float[batch_size * beam_width]; float* expected_cum_log_probs = new float[batch_size * beam_width];
initRandom(h_logits, batch_size * beam_width * vocab_size, -3.0f, 3.0f); initRandom(h_logits, batch_size * beam_width * vocab_size, -3.0f, 3.0f);
computeProb(h_probs, h_logits, batch_size * beam_width, vocab_size); computeProb(h_probs, h_logits, batch_size * beam_width, vocab_size);
...@@ -810,10 +1122,11 @@ protected:
        int* tiled_input_lengths_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * beam_width));
        float* cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size * beam_width));
        float* output_log_probs =
            reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
        int* output_ids =
            reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
        int* h_output_ids = new int[batch_size * beam_width];
        int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
...@@ -824,65 +1137,64 @@ protected:
        cudaMemset(output_log_probs, 0, sizeof(float) * max_output_len * batch_size * beam_width);
        cudaMemset(output_ids, 0, sizeof(int) * max_seq_len * batch_size * beam_width);
        TensorMap input_tensors({{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
                                 {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
                                 {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
                                 {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
                                 {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
        dynamic_decode_layer->setup(batch_size, beam_width, &input_tensors);

        for (size_t step = max_input_len; step < max_output_len; ++step) {
            uint ite = 0;
            // Reset by the test value since the sampling layer internally updates the logit buffer
            // (making it log-prob).
            cudaH2Dcpy(d_logits, h_logits, batch_size * beam_width * vocab_size);
            TensorMap dynamic_decode_input_tensors(
                {{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
                 {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
                 {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
                 {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
                 {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
                 {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
                 {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &batch_size}},
                 {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
                 {"random_seed", {MEMORY_CPU, TYPE_UINT64, {1}, &seed}},
                 {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
                 {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
                 {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
                 {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});

            // common outputs
            TensorMap dynamic_decode_output_tensors(
                {{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
                 {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
                 {"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, cum_log_probs}},
                 {"output_log_probs",
                  Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, output_log_probs}},
                 {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});

            dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);

            TM_LOG_DEBUG("Step %2d generated ids", step);
            cudaD2Hcpy(
                h_output_ids,
                dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset<int>(step * (batch_size * beam_width)),
                batch_size * beam_width);
            cudaD2Hcpy(h_cum_log_probs, cum_log_probs, batch_size * beam_width);
            cudaD2Hcpy(h_output_log_probs, output_log_probs, max_output_len * batch_size * beam_width);

            for (size_t i = 0; i < batch_size * beam_width; ++i) {
                int idx = i * vocab_size + h_output_ids[i];
                expected_cum_log_probs[i] += (float)h_log_probs[idx];
                TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                             "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                             (int)step,
                             (int)i,
                             (int)idx,
                             (int)h_output_ids[i],
                             h_output_log_probs[step * batch_size * beam_width + i],
                             (float)h_log_probs[idx],
                             h_cum_log_probs[i],
                             expected_cum_log_probs[i],
                             (float)h_probs[idx]);
            }
            TM_LOG_DEBUG("");
        }
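        // The elided assertions below compare the layer's cum_log_probs output with
        // expected_cum_log_probs, the host-side running sum of log-softmax(h_logits)
        // at each sampled token accumulated in the loop above.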
...@@ -898,7 +1210,6 @@ protected:
        delete dynamic_decode_layer;
    }
};

TYPED_TEST_SUITE(SamplingDecodeTest2, FloatAndHalfTypes);
...
#include <iostream>
#include <unordered_map>
#include <vector>

#include <gtest/gtest.h>
...@@ -10,16 +10,17 @@ using namespace turbomind;

namespace {

#define EXPECT_EQUAL_TENSORS(t1, t2)                                                                                   \
    do {                                                                                                               \
        EXPECT_TRUE(t1.where == t2.where);                                                                             \
        EXPECT_TRUE(t1.type == t2.type);                                                                               \
        EXPECT_TRUE(t1.shape == t2.shape);                                                                             \
        EXPECT_TRUE(t1.data == t2.data);                                                                               \
    } while (false)
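// Hypothetical usage (illustration, not one of the tests below): two Tensor
// views over the same buffer with identical placement, type and shape compare
// equal, since all four identifying fields match.
//
//   float  buf[4] = {0.f, 1.f, 2.f, 3.f};
//   Tensor a{MEMORY_CPU, TYPE_FP32, {4}, buf};
//   Tensor b{MEMORY_CPU, TYPE_FP32, {4}, buf};
//   EXPECT_EQUAL_TENSORS(a, b);  // passes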
TEST(TensorMapTest, HasKeyCorrectness)
{
    bool* v1 = new bool(true);
    float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
...@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
    delete[] v2;
}

TEST(TensorMapTest, InsertCorrectness)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    float* v2 = new float[2]{1.0f, 2.0f};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
...@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
    EXPECT_FALSE(map.isExist("t2"));
}

TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
    TensorMap map;
    EXPECT_TRUE(map.size() == 0);
    // forbid a none tensor.
...@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
    EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
}

TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);

    TensorMap map({{"t1", t1}});
    EXPECT_TRUE(map.size() == 1);
    // forbid a duplicated key.
...@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
    delete[] v1;
}

TEST(TensorMapTest, GetValCorrectness)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);

    TensorMap map({{"t1", t1}});
...@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
    delete[] v1;
}

TEST(TensorMapTest, GetTensorCorrectness)
{
    bool* t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};

    int* default_val = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};

    TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
    delete[] t1_val;
}

TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
{
    bool* t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};

    int* default_val = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};

    const TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
    delete[] t1_val;
}

TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
    Tensor t1;
    EXPECT_THROW(t1.min<int>(), std::runtime_error);
    EXPECT_THROW(t1.max<int>(), std::runtime_error);
...@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
    EXPECT_THROW(t2.max<int>(), std::runtime_error);
}
using TensorTypes = testing::Types<int8_t, int, float>;

template<typename T>
class TensorFuncTest: public testing::Test {};

TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);

TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
    using T = TypeParam;

    size_t size = 4;
    T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
    T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
    T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
    Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
...@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
    delete[] v3;
}
TYPED_TEST(TensorFuncTest, MinCorrectness)
{
    using T = TypeParam;

    size_t size = 4;
...@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
    delete[] v3;
}

TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
    using T = TypeParam;

    T* v = new T[4]{T(1), T(2), T(3), T(4)};
    Tensor t = Tensor{MEMORY_CPU, getTensorType<T>(), {4}, v};

    EXPECT_TRUE(t.any<T>(T(1)));
    EXPECT_FALSE(t.any<T>(T(5)));

    delete[] v;
}
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
    using T = TypeParam;

    constexpr size_t size = 4;
    T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
    T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
    Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
    Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};

    EXPECT_TRUE(t1.all<T>(T(1)));
    EXPECT_FALSE(t2.all<T>(T(2)));

    delete[] v1;
    delete[] v2;
}

TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
    using T = TypeParam;

    constexpr int size = 12;
    T* v = new T[size];
    for (int i = 0; i < size; ++i) {
        v[i] = i;
    }

    DataType dtype = getTensorType<T>();
    Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
    Tensor t2 = t1.slice({2, 4}, 4);
    EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));

    // An overflowed tensor throws an exception.
...@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
    delete[] v;
}

}  // end of namespace
...@@ -16,15 +16,15 @@

#pragma once

#include <algorithm>  // min, max
#include <assert.h>   // assert
#include <float.h>    // FLT_MAX
#include <iostream>   // snprintf
#include <limits>     // numeric_limits
#include <math.h>     // expf, log
#include <stdlib.h>   // rand
#include <string>     // string
#include <vector>     // vector

#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
...@@ -36,32 +36,37 @@

using namespace turbomind;

class TestFailureError: public std::exception {
private:
    std::string msg_;

public:
    explicit TestFailureError() = default;
    explicit TestFailureError(std::string name, std::string msg = "")
    {
        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
    }
    const char* what() const throw()
    {
        return msg_.c_str();
    }
};
#define EXPECT_TRUE(cond)                                                                                              \
    do {                                                                                                               \
        if (!(cond)) {                                                                                                 \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
            throw TestFailureError(__func__);                                                                          \
        }                                                                                                              \
    } while (false)

#define EXPECT_FALSE(cond)                                                                                             \
    do {                                                                                                               \
        if (cond) {                                                                                                    \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
            throw TestFailureError(__func__);                                                                          \
        }                                                                                                              \
    } while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
...@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
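// The elided body presumably applies the usual mixed absolute/relative test;
// a minimal sketch under that assumption:
//
//   bool close = fabsf(a - b) <= (atol + rtol * fabsf(b));
//
// atol governs comparisons near zero, rtol governs large magnitudes.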
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
{
    size_t failures = 0;
    float relative_gap = 0.0f;

    for (size_t i = 0; i < size; ++i) {
        // The values for the output and the reference.
...@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * size);
    TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                failures <= tol_failures ? "....OK" : "FAILED",
                name.c_str(),
                100. * failures / size,
                atol,
                rtol,
                100. * relative_gap);
    return failures <= tol_failures;
}
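// Hypothetical call site (illustration only; both pointers are host buffers
// for this overload):
//
//   bool ok = checkResult("sampled_logits", h_out, h_ref, n, 1e-4f, 1e-2f);
//
// The check still passes when at most 1% of the n elements violate the
// atol/rtol bound, so occasional half-precision outliers do not fail an
// otherwise healthy run.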
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
{
    bool is_fp32 = sizeof(T) == 4;
    float atol = is_fp32 ? 1e-4f : 1e-3f;
    float rtol = is_fp32 ? 1e-2f : 1e-1f;

    T* h_out = nullptr;
    if (device_out) {
...@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
        ref = h_ref;
    }
    bool is_ok = checkResult(name, out, ref, size, atol, rtol);
    if (h_out != nullptr) {
        delete[] h_out;
    }
    if (h_ref != nullptr) {
...@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
}
template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval)
{
    for (size_t i = 0; i < size; ++i) {
        float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        val *= (maxval - minval);
...@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
    }
}

void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
    assert(minval < maxval);
    int mod = maxval - minval;
    for (size_t i = 0; i < size; ++i) {
...@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
}
template<typename T>
void tile(T* x, int m, int n)
{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            x[i * n + j] = x[j];
...@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
}

template<typename T>
void tile(T* dst, T* src, int m, int n)
{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            dst[i * n + j] = src[j];
...@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
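// Illustrative use, mirroring how the sampling tests tile per-batch values
// across beams (buffer names hypothetical). Note that both variants write only
// rows 1..m-1: the in-place form keeps row 0 as the source, and the dst/src
// form appears to leave row 0 of dst to the caller.
//
//   int lengths[batch_size * beam_width];
//   std::copy(h_input_lengths, h_input_lengths + batch_size, lengths);
//   tile(lengths, beam_width, batch_size);  // replicate row 0 across beams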
#define HALF_FLT_MAX 65504.0f

template<typename T>
bool isHalf()
{
    return std::is_same<T, half>::value;
}

template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
    printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
}