Support windows platform (#209)

* __PRETTY_FUNCTION__
* CASE_K
* uint
* remove not
* HALF_FLT_MAX
* struct init
* port utils
* better build pthread-win32
* port kernels
* port utils/gemm_test
* hide windows header
* port models
* port examples && triton_backend && unittests
* update build readme
* fix lint
* fix lint
* fix lint
* fix lint
* fix lint
* fix build
* fix build
* cmake version
* fix typos
* update ci
* port kernels/gemm_s_f16
* update ci
* fix ci
* use cudaStreamSynchronize instead of volatile check
* remove gettimeofday
* remove pthread-win32
* remove dirent.h
* update pre-commit
* update
* remove todo
* fix include
* fix build
* fix build
* fix build ci
* fix github action trigger
* update README
* fix linux-build ci
* remove windows folder
* fix lint
* update readme

Support windows platform (#209)
* __PRETTY_FUNCTION__ * CASE_K * uint * remove not * HALF_FLT_MAX * struct init * port utils * better build pthread-win32 * port kernels * port utils/gemm_test * hide windows header * port models * port examples && triton_backend && unittests * update build readme * fix lint * fix lint * fix lint * fix lint * fix lint * fix build * fix build * cmake version * fix typos * update ci * port kernels/gemm_s_f16 * update ci * fix ci * use cudaStreamSynchronize instead of volatile check * remove gettimeofday * remove pthread-win32 * remove dirent.h * update pre-commit * update * remove todo * fix include * fix build * fix build * fix build ci * fix github action trigger * update README * fix linux-build ci * remove windows folder * fix lint * update readme
4c9959f6 · Chen Xin · GitHub · 0d21f366 · 4c9959f6 · 4c9959f6
Unverified Commit 4c9959f6 authored Aug 17, 2023 by Chen Xin Committed by GitHub Aug 17, 2023
18 changed files
--- a/src/turbomind/utils/logger.h
+++ b/src/turbomind/utils/logger.h
@@ -24,6 +24,12 @@
 namespace turbomind {
+// cub.cuh pulls in windows.h, which defines an ERROR macro; undefine it here.
+// This header should therefore be included after cub.cuh.
+#ifdef ERROR
+#undef ERROR
+#endif
 class Logger {
 public:

--- a/src/turbomind/utils/memory_utils.cu
+++ b/src/turbomind/utils/memory_utils.cu
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/logger.h"
@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
        }
        // get slices
-        ConcateSlice slice0{.slices = {{0, dim0}}};
+        ConcateSlice slice0{{{0, dim0}}};
-        ConcateSlice slice1{.slices = {{0, dim1}}};
+        ConcateSlice slice1{{{0, dim1}}};
        if (slices.size() > 0 && slices[0].slices.size() > 0) {
            slice0 = slices[0];
        }

--- a/src/turbomind/utils/nccl_utils.cc
+++ b/src/turbomind/utils/nccl_utils.cc
@@ -15,6 +15,7 @@
 */
 #include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/macro.h"
 #include <atomic>
 namespace turbomind {

--- a/src/turbomind/utils/nvtx_utils.cc
+++ b/src/turbomind/utils/nvtx_utils.cc
@@ -18,7 +18,7 @@
 #include "nvtx_utils.h"
 #ifdef USE_NVTX
-#include "nvToolsExt.h"
+#include "nvtx3/nvToolsExt.h"
 #endif
 namespace ft_nvtx {

--- a/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc
+++ b/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc
@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
    const T*          scales_ptr    = get_ptr<const T>(scales);
    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
-    const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
+    const int                                          ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
    auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
    auto ws_tensor     = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-    T*   output_tensor_ptr = get_ptr<T>(output_tensor);
+    T*    output_tensor_ptr = get_ptr<T>(output_tensor);
    char* ws_ptr            = get_ptr<char>(ws_tensor);
    cudaEvent_t start, stop;
@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
    const T*          bias_ptr      = get_ptr<const T>(bias);
    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
-    const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
+    const int                                          ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
    auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
    auto ws_tensor     = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-    T*   output_tensor_ptr = get_ptr<T>(output_tensor);
+    T*    output_tensor_ptr = get_ptr<T>(output_tensor);
    char* ws_ptr            = get_ptr<char>(ws_tensor);
    fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,

--- a/tests/csrc/int8_gemm/int8_gemm_test.cu
+++ b/tests/csrc/int8_gemm/int8_gemm_test.cu
@@ -14,11 +14,11 @@
 * limitations under the License.
 */
+#include <chrono>
+#include <cstdlib>
 #include <cublas_v2.h>
 #include <iostream>
 #include <vector>
-#include <cstdlib>
-#include <chrono>
 #include "torch/csrc/cuda/Stream.h"
 #include <torch/custom_class.h>
@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
 namespace ft = turbomind;
 template<typename T>
-void int8_gemm_test(
+void int8_gemm_test(const int            m,
-    const int m,
+                    const int            n,
-    const int n,
+                    const int            k,
-    const int k,
+                    const at::ScalarType output_data_type,
-    const at::ScalarType output_data_type,
+                    const QuantMode      quant_mode,
-    const QuantMode quant_mode,
+                    const int            iters)
-    const int iters)
 {
-     const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
+    const bool per_token_quant =
-        || quant_mode == QuantMode::PerTokenQuant;
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
-    const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
+    const bool per_channel_quant =
-        || quant_mode == QuantMode::PerChannelQuant;
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
    const int row_scale_size = per_token_quant ? m : 1;
    const int col_scale_size = per_channel_quant ? n : 1;
@@ -76,16 +75,16 @@ void int8_gemm_test(
    ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
    ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
-    auto x_gpu = x.to(at_int8).to(torch::kCUDA);
+    auto x_gpu       = x.to(at_int8).to(torch::kCUDA);
-    auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
+    auto w_T_gpu     = w.to(at_int8).to(torch::kCUDA).t().contiguous();
-    auto w_gpu = w.to(at_int8).to(torch::kCUDA);
+    auto w_gpu       = w.to(at_int8).to(torch::kCUDA);
-    auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
+    auto y_gpu       = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
    auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
-    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
+    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
-        torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
+                             * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
-    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
+    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
-        torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
+                             * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
    auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
    auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
@@ -101,40 +100,41 @@ void int8_gemm_test(
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    // warm_up
    cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
-            get_ptr<int8_t>(w_T_gpu),
+                             get_ptr<int8_t>(w_T_gpu),
-            quant_mode,
+                             quant_mode,
-            get_ptr<float>(alpha_col_gpu),
+                             get_ptr<float>(alpha_col_gpu),
-            get_ptr<float>(alpha_row_gpu),
+                             get_ptr<float>(alpha_row_gpu),
-            get_ptr<T>(y_gpu),
+                             get_ptr<T>(y_gpu),
-            m,
+                             m,
-            n,
+                             n,
-            k,
+                             k,
-            nullptr,
+                             nullptr,
-            0,
+                             0,
-            stream);
+                             stream);
    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
-    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
+    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
+        "y_gpu_int32.npy");
    ft::check_cuda_error(cudaStreamSynchronize(stream));
    auto start = high_resolution_clock::now();
    for (int i = 0; i < iters; ++i) {
        cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
-            get_ptr<int8_t>(w_T_gpu),
+                                 get_ptr<int8_t>(w_T_gpu),
-            quant_mode,
+                                 quant_mode,
-            get_ptr<float>(alpha_col_gpu),
+                                 get_ptr<float>(alpha_col_gpu),
-            get_ptr<float>(alpha_row_gpu),
+                                 get_ptr<float>(alpha_row_gpu),
-            get_ptr<T>(y_gpu),
+                                 get_ptr<T>(y_gpu),
-            m,
+                                 m,
-            n,
+                                 n,
-            k,
+                                 k,
-            nullptr,
+                                 nullptr,
-            0,
+                                 0,
-            stream);
+                                 stream);
    }
    ft::check_cuda_error(cudaStreamSynchronize(stream));
@@ -142,27 +142,30 @@ void int8_gemm_test(
    auto duration = duration_cast<microseconds>(end - start);
-    if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
+    if (torch::allclose(
+            (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
        TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-    } else {
+    }
+    else {
        TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
+        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
+        // alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
    }
 }
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
    if (argc != 7) {
-        TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
+        TM_LOG_ERROR(
+            "arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
        return 0;
    }
-    const int m = atoi(argv[1]);
+    const int            m                = atoi(argv[1]);
-    const int n = atoi(argv[2]);
+    const int            n                = atoi(argv[2]);
-    const int k = atoi(argv[3]);
+    const int            k                = atoi(argv[3]);
-    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
+    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
-        at::ScalarType::Half : at::ScalarType::BFloat16;
+    const QuantMode      quant_mode       = static_cast<QuantMode>(atoi(argv[5]));
-    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
    if (quant_mode == QuantMode::PerChannelQuant) {
        printf("per channel quant \n");
    }
@@ -170,7 +173,8 @@ int main(int argc, char **argv)
    if (output_data_type == at::ScalarType::Half) {
        int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
-    } else {
+    }
+    else {
 #if ENABLE_BF16
        int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
 #endif

--- a/tests/csrc/unittests/CMakeLists.txt
+++ b/tests/csrc/unittests/CMakeLists.txt
@@ -20,7 +20,12 @@ FetchContent_Declare(
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG release-1.12.1
 )
-add_definitions(-DTORCH_CUDA=1)
+find_package(CUDAToolkit REQUIRED)
+if (NOT MSVC)
+  add_definitions(-DTORCH_CUDA=1)
+endif()
 # For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
 # Sorted by alphabetical order of test name.
 target_link_libraries(  # Libs for test_attention_kernels
  unittest PUBLIC
-    -lcudart -lcurand
+    CUDA::cudart CUDA::curand
    gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
 target_link_libraries(  # Libs for test_logprob_kernels
  unittest PUBLIC
-    -lcudart
+    CUDA::cudart
    logprob_kernels memory_utils cuda_utils logger)
 target_link_libraries(  # Libs for test_penalty_kernels
  unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
    sampling_penalty_kernels memory_utils cuda_utils logger)
 target_link_libraries(  # Libs for test_sampling_kernel
  unittest PUBLIC
-    -lcudart
+    CUDA::cudart
    sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
 target_link_libraries(  # Libs for test_sampling_layer
  unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
    cublasMMWrapper memory_utils
    DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
 target_link_libraries(  # Libs for test_tensor
@@ -65,7 +70,7 @@ target_link_libraries(  # Libs for test_tensor
 remove_definitions(-DTORCH_CUDA=1)
 add_executable(test_gemm test_gemm.cu)
-target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
+target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
 add_executable(test_gpt_kernels test_gpt_kernels.cu)
 target_link_libraries(test_gpt_kernels PUBLIC
@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
 add_executable(test_context_attention_layer test_context_attention_layer.cu)
 target_link_libraries(test_context_attention_layer PUBLIC
-                      Llama -lcublas -lcublasLt -lcudart
+                      Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
                      unfused_attention_kernels
                      memory_utils tensor cublasMMWrapper cuda_utils logger)
--- a/tests/csrc/unittests/test_attention_kernels.cu
+++ b/tests/csrc/unittests/test_attention_kernels.cu
@@ -14,13 +14,12 @@
 * limitations under the License.
 */
+#include "gtest_utils.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/kernels/unfused_attention_kernels.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"
-#include "gtest_utils.h"
 #include <curand.h>
 #include <sstream>

--- a/tests/csrc/unittests/test_context_attention_layer.cu
+++ b/tests/csrc/unittests/test_context_attention_layer.cu
@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
    // compute actual
    using AttentionOp = FlashAttentionOp<scalar_t>;
    using Layout      = typename AttentionOp::AttentionLayout;
-    Layout      layout_q{.stride_batch = num_heads * seq_len * size_per_head,
+    Layout      layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
-                         .stride_seq   = size_per_head,
+    Layout      layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
-                         .stride_head  = seq_len * size_per_head};
+    Layout      layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
-    Layout      layout_k{.stride_batch = num_heads * key_len * size_per_head,
+    Layout      layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
-                         .stride_seq   = size_per_head,
-                         .stride_head  = key_len * size_per_head};
-    Layout      layout_v{.stride_batch = num_heads * key_len * size_per_head,
-                         .stride_seq   = size_per_head,
-                         .stride_head  = key_len * size_per_head};
-    Layout      layout_o{.stride_batch = num_heads * seq_len * size_per_head,
-                         .stride_seq   = num_heads * size_per_head,
-                         .stride_head  = size_per_head,
-                         .use_seqlens  = true};
    AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
    float*      accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
-    typename AttentionOp::Params attn_params{.attn_out     = actual_out_ptr,
+    typename AttentionOp::Params attn_params{actual_out_ptr,
-                                             .query        = query_ptr,
+                                             query_ptr,
-                                             .key          = key_ptr,
+                                             key_ptr,
-                                             .val          = val_ptr,
+                                             val_ptr,
-                                             .mask         = mask_ptr,
+                                             mask_ptr,
-                                             .out_accum    = accum_buf_ptr,
+                                             accum_buf_ptr,
-                                             .cu_seqlens_q = cu_seqlens_ptr,
+                                             cu_seqlens_ptr,
-                                             .cu_seqlens_k = nullptr,
+                                             nullptr,
-                                             .group_size   = 1,
+                                             1,
-                                             .layout_q     = layout_q,
+                                             layout_q,
-                                             .layout_k     = layout_k,
+                                             layout_k,
-                                             .layout_v     = layout_v,
+                                             layout_v,
-                                             .layout_o     = layout_o};
+                                             layout_o};
    flash_attention(attn_params, stream);
    sync_check_cuda_error();

--- a/tests/csrc/unittests/test_gemm.cu
+++ b/tests/csrc/unittests/test_gemm.cu
--- a/tests/csrc/unittests/test_int8.cu
+++ b/tests/csrc/unittests/test_int8.cu
@@ -5,10 +5,10 @@
 #include <string>
 #include <vector>
+#include "src/turbomind/kernels/transpose_int8_kernels.h"
+#include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include "src/turbomind/utils/Tensor.h"
-#include "src/turbomind/kernels/transpose_int8_kernels.h"
 #include <algorithm>
 #include <iostream>
@@ -39,13 +39,14 @@ protected:
    void testTransposition();
 };
-void fill_tensor_random(Tensor a) {
+void fill_tensor_random(Tensor a)
-    const size_t num_elems = a.size();
+{
-    std::vector<int8_t> host_values(num_elems);
+    const size_t                          num_elems = a.size();
+    std::vector<int8_t>                   host_values(num_elems);
    std::uniform_int_distribution<int8_t> int8_random(-128, 127);
-    std::mt19937 rng(0);
+    std::mt19937                          rng(0);
-    std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng](){ return int8_random(rng); });
+    std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
    cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
 }
@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()
    int8_t *a_data, *a_t_data;
    cudaMalloc(&a_data, m * k * sizeof(int8_t));
-    Tensor a {MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
+    Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
    fill_tensor_random(a);
    cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
-    Tensor a_t {MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
+    Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
    std::vector<int8_t> a_t_host_ref(a_t.size());
    reference_transpose_host(a_t_host_ref, a);

--- a/tests/csrc/unittests/test_logprob_kernels.cu
+++ b/tests/csrc/unittests/test_logprob_kernels.cu
 #include <assert.h>
-#include <math.h>
 #include <float.h>
+#include <math.h>
 #include <stdexcept>
 #include <tuple>
 #include <vector>
+#ifdef __linux__
 #include <sys/time.h>
+#endif
 #include "src/turbomind/kernels/logprob_kernels.h"
 #include "src/turbomind/utils/allocator.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {
    size_t vocab_size;
    size_t beam_width;
-    std::string toString() {
+    std::string toString()
+    {
        return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
-                      max_input_length, batch_size, vocab_size, beam_width);
+                      max_input_length,
+                      batch_size,
+                      vocab_size,
+                      beam_width);
    }
 };
 /////////////////////////////////// Unittests //////////////////////////////////////////
 template<typename T>
-class LogProbKernelTest : public FtTestBase {
+class LogProbKernelTest: public FtTestBase {
 protected:
-    void computeCumLogProbs(float* cum_log_probs,
+    void computeCumLogProbs(float*       cum_log_probs,
-                            float* log_probs,
+                            float*       log_probs,
-                            const T* logits,
+                            const T*     logits,
-                            const int* input_ids,
+                            const int*   input_ids,
-                            const int* input_lengths,
+                            const int*   input_lengths,
                            const size_t max_input_length,
                            const size_t batch_size,
                            const size_t vocab_size,
@@ -54,9 +59,9 @@ protected:
                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
-                    size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
+                    size_t   step_offset = (step - 1) * batch_size * vocab_size_padded;
-                    const T* vec = logits + step_offset + i * vocab_size_padded;
+                    const T* vec         = logits + step_offset + i * vocab_size_padded;
-                    float max_logits = -FLT_MAX;
+                    float    max_logits  = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {
@@ -67,7 +72,7 @@ protected:
                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
-                    int token_id = input_ids[step * batch_size + i];
+                    int   token_id = input_ids[step * batch_size + i];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[step * batch_size + i] = log_prob;
@@ -78,11 +83,11 @@ protected:
        }
    }
-    void computeCumLogProbsBatchFirst(float* cum_log_probs,
+    void computeCumLogProbsBatchFirst(float*       cum_log_probs,
-                                      float* log_probs,
+                                      float*       log_probs,
-                                      const T* logits,
+                                      const T*     logits,
-                                      const int* input_ids,
+                                      const int*   input_ids,
-                                      const int* input_lengths,
+                                      const int*   input_lengths,
                                      const size_t max_input_length,
                                      const size_t batch_size,
                                      const size_t vocab_size,
@@ -98,8 +103,8 @@ protected:
                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
-                    const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
+                    const T* vec        = logits + batch_offset + (step - 1) * vocab_size_padded;
-                    float max_logits = -FLT_MAX;
+                    float    max_logits = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {
@@ -110,7 +115,7 @@ protected:
                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
-                    int token_id = input_ids[i * max_input_length + step];
+                    int   token_id = input_ids[i * max_input_length + step];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[i * max_input_length + step] = log_prob;
@@ -122,17 +127,17 @@ protected:
    }
 public:
+    void runTest(LogProbKernelTestParam param)
-    void runTest(LogProbKernelTestParam param) {
+    {
        size_t max_input_length = param.max_input_length;
-        size_t batchxbeam = param.batch_size * param.beam_width;
+        size_t batchxbeam       = param.batch_size * param.beam_width;
-        size_t vocab_size = param.vocab_size;
+        size_t vocab_size       = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
-        T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
+        T*   h_logits        = new T[max_input_length * batchxbeam * vocab_size];
-        int* h_input_ids = new int[max_input_length * batchxbeam];
+        int* h_input_ids     = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers
@@ -145,9 +150,9 @@ public:
        memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
        // device buffers
-        T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
+        T*   d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
-        int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
+        int* d_input_ids       = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
-        int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
+        int* d_input_lengths   = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers
@@ -157,7 +162,7 @@ public:
        deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
-        void* workspace = allocator->malloc(workspace_size);
+        void*  workspace      = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,
@@ -189,16 +194,17 @@ public:
        delete[] h_logits;
    }
-    void runBatchFirstTest(LogProbKernelTestParam param) {
+    void runBatchFirstTest(LogProbKernelTestParam param)
+    {
        size_t max_input_length = param.max_input_length;
-        size_t batchxbeam = param.batch_size * param.beam_width;
+        size_t batchxbeam       = param.batch_size * param.beam_width;
-        size_t vocab_size = param.vocab_size;
+        size_t vocab_size       = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
-        T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
+        T*   h_logits        = new T[max_input_length * batchxbeam * vocab_size_padded];
-        int* h_input_ids = new int[max_input_length * batchxbeam];
+        int* h_input_ids     = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers
@@ -213,8 +219,8 @@ public:
        // device buffers
        T* d_logits =
            reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
-        int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
+        int*   d_input_ids     = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
-        int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
+        int*   d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers
@@ -224,7 +230,7 @@ public:
        check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
-        void* workspace = allocator->malloc(workspace_size);
+        void*  workspace      = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,
@@ -239,16 +245,16 @@ public:
                                true);
        computeCumLogProbsBatchFirst(expected_cum_log_probs,
-                                    nullptr,
+                                     nullptr,
-                                    h_logits,
+                                     h_logits,
-                                    h_input_ids,
+                                     h_input_ids,
-                                    h_input_lengths,
+                                     h_input_lengths,
-                                    max_input_length,
+                                     max_input_length,
-                                    batchxbeam,
+                                     batchxbeam,
-                                    vocab_size,
+                                     vocab_size,
-                                    vocab_size_padded);
+                                     vocab_size_padded);
-        std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
+        std::string tag    = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
-        bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
+        bool        passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
        EXPECT_TRUE(passed);
        delete[] expected_cum_log_probs;
@@ -256,10 +262,8 @@ public:
        delete[] h_input_ids;
        delete[] h_logits;
    }
 };
 TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);
 TYPED_TEST(LogProbKernelTest, SingleStep)

--- a/tests/csrc/unittests/test_penalty_kernels.cu
+++ b/tests/csrc/unittests/test_penalty_kernels.cu
@@ -14,24 +14,24 @@
 * limitations under the License.
 */
-#include <algorithm>   // std::min, std::max
+#include <algorithm>  // std::min, std::max
-#include <iostream>    // snprintf
+#include <iostream>   // snprintf
-#include <math.h>      // expf, log
+#include <math.h>     // expf, log
 #include <stdexcept>
-#include <stdlib.h>    // rand
+#include <stdlib.h>   // rand
-#include <string>      // std::string
+#include <string>     // std::string
 #include <unordered_map>
-#include <vector>      // std::vector
+#include <vector>     // std::vector
-#include <cublas_v2.h>
 #include <cublasLt.h>
+#include <cublas_v2.h>
 #include <cuda_runtime.h>
+#include "gtest_utils.h"
 #include "src/turbomind/kernels/penalty_types.h"
 #include "src/turbomind/kernels/sampling_penalty_kernels.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include "gtest_utils.h"
 using namespace turbomind;
@@ -41,21 +41,25 @@ struct TemperatureTestParam {
    float* temperatures;
    size_t temperatures_size;
-    std::string toString() {
+    std::string toString()
+    {
        return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
-                      batch_size, vocab_size, arr2str(temperatures, temperatures_size).c_str());
+                      batch_size,
+                      vocab_size,
+                      arr2str(temperatures, temperatures_size).c_str());
    }
 };
-size_t pad_vocab_size(size_t vocab_size, size_t pad = 8) {
+size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
+{
    return (vocab_size + pad - 1) / pad * pad;
 }
 template<typename T>
-void applyRepetitonPenalty(T* logits,
+void applyRepetitonPenalty(T*           logits,
-                           const int* output_ids,
+                           const int*   output_ids,
-                           const int* input_lengths,
+                           const int*   input_lengths,
-                           const float repetition_penalty,
+                           const float  repetition_penalty,
                           const size_t step,
                           const size_t max_input_length,
                           const size_t batch_size,
@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,
            int token_id = output_ids[i + t * batch_size];
            if (!penalized[token_id]) {
                float logit = static_cast<float>(logits[offset + token_id]);
-                logits[offset + token_id] = static_cast<T>(logit < 0.0f ?
+                logits[offset + token_id] =
-                    logit * repetition_penalty : logit / repetition_penalty);
+                    static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
                penalized[token_id] = true;
            }
        }
@@ -84,9 +88,9 @@ void applyRepetitonPenalty(T* logits,
 }
 template<typename T>
-void batchApplyRepetitonPenalty(T* logits,
+void batchApplyRepetitonPenalty(T*           logits,
-                                const int* output_ids,
+                                const int*   output_ids,
-                                const int* input_lengths,
+                                const int*   input_lengths,
                                const float* repetition_penalties,
                                const size_t step,
                                const size_t max_input_length,
@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,
 }
 template<typename T>
-void initLogitsAndBias(T* logits,
+void initLogitsAndBias(
-                       T* bias,
+    T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
-                       const size_t batch_size,
-                       const size_t vocab_size,
-                       const size_t vocab_size_padded)
 {
    initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
    if (bias != nullptr) {
@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,
    }
 }
 /////////////////////////////////// Tests //////////////////////////////////////////
 template<typename T>
-class TemperaturePenaltyTest : public FtTestBase {
+class TemperaturePenaltyTest: public FtTestBase {
 protected:
    // Set up test
    size_t batch_size_;
@@ -157,17 +157,18 @@ protected:
    float* d_temperatures_;
-    void subsetup(TemperatureTestParam param) {
+    void subsetup(TemperatureTestParam param)
-        batch_size_ = param.batch_size;
+    {
-        vocab_size_ = param.vocab_size;
+        batch_size_        = param.batch_size;
+        vocab_size_        = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
        h_logits_ = new T[batch_size_ * vocab_size_padded_];
-        h_bias_ = new T[vocab_size_padded_];
+        h_bias_   = new T[vocab_size_padded_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
-        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
+        d_bias_   = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
        if (param.temperatures_size > 1) {
@@ -177,7 +178,8 @@ protected:
        }
    }
-    void subteardown() {
+    void subteardown()
+    {
        delete[] h_logits_;
        delete[] h_bias_;
    }
@@ -195,7 +197,7 @@ protected:
            ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
            for (size_t j = 0; j < vocab_size; ++j) {
                size_t index = i * vocab_size_padded + j;
-                float logit = static_cast<float>(logits[index]);
+                float  logit = static_cast<float>(logits[index]);
                if (bias != nullptr) {
                    logit += static_cast<float>(bias[j]);
                }
@@ -204,29 +206,18 @@ protected:
        }
    }
 public:
    void runTest(TemperatureTestParam param)
    {
        subsetup(param);
        // Do test
        if (param.temperatures_size == 1) {
-            invokeApplyTemperaturePenalty(d_logits_,
+            invokeApplyTemperaturePenalty(
-                                          d_bias_,
+                d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
-                                          param.temperatures[0],
-                                          batch_size_,
-                                          vocab_size_,
-                                          vocab_size_padded_,
-                                          stream);
        }
        else {
-            invokeBatchApplyTemperaturePenalty(d_logits_,
+            invokeBatchApplyTemperaturePenalty(
-                                               d_bias_,
+                d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
-                                               d_temperatures_,
-                                               batch_size_,
-                                               vocab_size_,
-                                               vocab_size_padded_,
-                                               stream);
        }
        computeReference(h_logits_,
                         h_bias_,
@@ -240,21 +231,17 @@ public:
        subteardown();
    }
-    void runConsistencyTest(TemperatureTestParam param) {
+    void runConsistencyTest(TemperatureTestParam param)
+    {
        // Set up test
        ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
        subsetup(param);
        // Run a single runtime value case.
-        invokeApplyTemperaturePenalty(d_logits_,
+        invokeApplyTemperaturePenalty(
-                                      d_bias_,
+            d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
-                                      param.temperatures[0],
-                                      batch_size_,
+        float  temperature    = param.temperatures[0];
-                                      vocab_size_,
-                                      vocab_size_padded_,
-                                      stream);
-        float temperature = param.temperatures[0];
        float* h_temperatures = new float[batch_size_];
        for (size_t i = 0; i < batch_size_; ++i) {
            h_temperatures[i] = temperature;
@@ -263,18 +250,14 @@ public:
        cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
        T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
-        T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
+        T* d_bias_batch   = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
-        invokeBatchApplyTemperaturePenalty(d_logits_batch,
+        invokeBatchApplyTemperaturePenalty(
-                                           d_bias_batch,
+            d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
-                                           d_temperatures_,
+        bool passed =
-                                           batch_size_,
+            checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
-                                           vocab_size_,
-                                           vocab_size_padded_,
-                                           stream);
-        bool passed = checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
        EXPECT_TRUE(passed);
        // Tear down test
@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)
 TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
 {
-    size_t batch_size = 6;
+    size_t batch_size   = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 1.0f;
@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
 TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
 {
-    size_t batch_size = 6;
+    size_t batch_size   = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 0.53f;
@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
 TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
 {
-    size_t batch_size = 6;
+    size_t batch_size   = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;
@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
 TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
 {
-    size_t batch_size = 6;
+    size_t batch_size   = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
-        temperatures[i] = i % 2 ==0 ? 2.01f : 0.53f;
+        temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, temperatures, batch_size});
 }
@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {
    size_t                repetition_penalties_size;
    RepetitionPenaltyType repetition_penalty_type;
-    std::string toString() {
+    std::string toString()
-        static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map {
+    {
+        static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
            {RepetitionPenaltyType::Additive, "additive"},
            {RepetitionPenaltyType::Multiplicative, "multiplicative"},
            {RepetitionPenaltyType::None, "none"}};
-        return fmtstr(
+        return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
-            "RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
+                      "repetition_penalties=%s, repetition_penalty_type=%s]",
-            "repetition_penalties=%s, repetition_penalty_type=%s]",
+                      batch_size,
-            batch_size, vocab_size, max_input_length,
+                      vocab_size,
-            arr2str(repetition_penalties, repetition_penalties_size).c_str(),
+                      max_input_length,
-            typestr_map.at(repetition_penalty_type).c_str());
+                      arr2str(repetition_penalties, repetition_penalties_size).c_str(),
+                      typestr_map.at(repetition_penalty_type).c_str());
    }
 };
 template<typename T>
-class RepetitionPenaltyTest : public FtTestBase {
+class RepetitionPenaltyTest: public FtTestBase {
 protected:
    // Set up test
    size_t batch_size_;
@@ -392,37 +377,38 @@ protected:
    size_t sequence_length_;
    size_t step_;
-    T* h_logits_;
+    T*   h_logits_;
-    T* h_bias_;
+    T*   h_bias_;
    int* h_output_ids_;
    int* h_input_lengths_;
-    T* d_logits_;
+    T*   d_logits_;
-    T* d_bias_;
+    T*   d_bias_;
    int* d_output_ids_;
    int* d_input_lengths_;
    float* d_repetition_penalties_;
-    void subsetup(RepetitionPenaltyTestCase param) {
+    void subsetup(RepetitionPenaltyTestCase param)
-        batch_size_ = param.batch_size;
+    {
-        vocab_size_ = param.vocab_size;
+        batch_size_        = param.batch_size;
+        vocab_size_        = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
-        max_input_length_ = param.max_input_length;
+        max_input_length_  = param.max_input_length;
-        sequence_length_ = 2 * max_input_length_;  // input + output
+        sequence_length_   = 2 * max_input_length_;  // input + output
-        step_ = sequence_length_ * 0.7;
+        step_              = sequence_length_ * 0.7;
-        h_logits_ = new T[batch_size_ * vocab_size_padded_];
+        h_logits_        = new T[batch_size_ * vocab_size_padded_];
-        h_bias_ = new T[vocab_size_padded_];
+        h_bias_          = new T[vocab_size_padded_];
-        h_output_ids_ = new int[sequence_length_ * batch_size_];
+        h_output_ids_    = new int[sequence_length_ * batch_size_];
        h_input_lengths_ = new int[batch_size_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
        initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
-        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
+        d_logits_        = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
-        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
+        d_bias_          = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
-        d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
+        d_output_ids_    = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
        d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
@@ -437,7 +423,8 @@ protected:
        }
    }
-    void subteardown() {
+    void subteardown()
+    {
        delete[] h_logits_;
        delete[] h_bias_;
        delete[] h_output_ids_;
@@ -540,7 +527,8 @@ public:
        subteardown();
    }
-    void runConsistencyTest(RepetitionPenaltyTestCase param) {
+    void runConsistencyTest(RepetitionPenaltyTestCase param)
+    {
        // Set up test
        ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
        subsetup(param);
@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)
 TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
 {
-    size_t batch_size = 6;
+    size_t batch_size           = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 1.0f;
@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
 TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
 {
-    size_t batch_size = 6;
+    size_t batch_size           = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 0.53f;
@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
 TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
 {
-    size_t batch_size = 6;
+    size_t batch_size   = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;
@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
 TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
 {
-    size_t batch_size = 6;
+    size_t batch_size           = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
-        repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
+        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
 }
@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)
 TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
 {
-    size_t batch_size = 6;
+    size_t batch_size           = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
-        repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
+        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
 }
@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)
 TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
 {
-    size_t batch_size = 6;
+    size_t batch_size           = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
-        repetition_penalties[i] = i % 2 ==0 ? 1.0f : 0.0f;
+        repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
 }

--- a/tests/csrc/unittests/test_sampling.cu
+++ b/tests/csrc/unittests/test_sampling.cu
@@ -12,6 +12,7 @@
 #include "src/turbomind/kernels/sampling_topk_kernels.h"
 #include "src/turbomind/layers/DynamicDecodeLayer.h"
 #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cublasMMWrapper.h"
 #include "src/turbomind/utils/cuda_utils.h"

--- a/tests/csrc/unittests/test_sampling_kernels.cu
+++ b/tests/csrc/unittests/test_sampling_kernels.cu
-#include <algorithm>   // std::fill_n
+#include <algorithm>  // std::fill_n
-#include <iostream>    // snprintf
+#include <iostream>   // snprintf
-#include <math.h>      // expf, log
+#include <math.h>     // expf, log
-#include <stdlib.h>    // rand
+#include <stdlib.h>   // rand
-#include <string>      // std::string
+#include <string>     // std::string
-#include <vector>      // std::vector
+#include <vector>     // std::vector
-#include <cublas_v2.h>
 #include <cublasLt.h>
+#include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
@@ -14,6 +14,7 @@
 #include "src/turbomind/kernels/sampling_topp_kernels.h"
 #include "src/turbomind/layers/DynamicDecodeLayer.h"
 #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cublasMMWrapper.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
-            int idx = bidx * vocab_size + i;
+            int   idx   = bidx * vocab_size + i;
            float logit = static_cast<float>(logits[idx]) - maxval;
-            probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
+            probs[idx]  = static_cast<T>(expf(logit) / (sum + EPSILON));
        }
    }
 }
@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
-            int idx = bidx * vocab_size + i;
+            int   idx     = bidx * vocab_size + i;
-            float logit = static_cast<float>(logits[idx]) - maxval;
+            float logit   = static_cast<float>(logits[idx]) - maxval;
            logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
        }
    }
@@ -119,10 +120,10 @@ public:
    }
 protected:
-    unsigned long long seed = 0;
+    unsigned long long              seed = 0;
-    cudaStream_t stream;
+    cudaStream_t                    stream;
    Allocator<AllocatorType::CUDA>* allocator;
-    curandState_t* curand_states;
+    curandState_t*                  curand_states;
 };
 template<typename T>
@@ -393,8 +394,8 @@ public:
    {
        this->runBatchTest(param, false, false);
        this->runBatchTest(param, false, true);
-        this->runBatchTest(param, true,  false);
+        this->runBatchTest(param, true, false);
-        this->runBatchTest(param, true,  true);
+        this->runBatchTest(param, true, true);
    }
 };
@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
    this->runTest({6, 4, 1, 4, 1.0f, 1});
 };
 TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
 {
    this->runTest({16, 51200, 1, 63, 1.0f, 8});
@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
    this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
 };
 template<typename T>
 class TopPSamplingKernelTest: public SamplingKernelTest<T> {
@@ -473,7 +472,7 @@ public:
        size_t batch_size = param.batch_size;
        size_t vocab_size = param.vocab_size;
        size_t output_len = param.output_len;
-        size_t seq_len = output_len;
+        size_t seq_len    = output_len;
        float top_p = param.top_p;
@@ -496,8 +495,8 @@ public:
        struct cudaDeviceProp device_prop;
        cudaGetDeviceProperties(&device_prop, device);
-        curandState_t* curand_states = reinterpret_cast<curandState_t*>(
+        curandState_t* curand_states =
-            allocator->malloc(sizeof(curandState_t) * batch_size, false));
+            reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
        invokeCurandInitialize(curand_states, batch_size, seed, stream);
        int* end_ids     = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
@@ -515,17 +514,17 @@ public:
        int* end_offsets      = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
        int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
-        size_t workspace_size = 0;
+        size_t workspace_size        = 0;
        size_t cub_temp_storage_size = 0;
        // retrieve the workspace size of the top-p sampling kernel.
        invokeTopPSampling<T>(nullptr,  // workspace
                              workspace_size,
                              cub_temp_storage_size,
-                              nullptr,  // output_ids
+                              nullptr,      // output_ids
-                              nullptr,  // sequence_length
+                              nullptr,      // sequence_length
-                              nullptr,  // finished_buffer
+                              nullptr,      // finished_buffer
-                              nullptr,  // cum_log_probs
+                              nullptr,      // cum_log_probs
-                              nullptr,  // output_log_probs
+                              nullptr,      // output_log_probs
                              (T*)nullptr,  // log_probs
                              topp_id_vals_buf,
                              end_offsets,
@@ -553,12 +552,7 @@ public:
            computeProb(h_probs, h_logits, batch_size, vocab_size);
            cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
-            invokeTopPInitialize(topp_id_vals_buf,
+            invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
-                                 end_offsets,
-                                 begin_offsets,
-                                 batch_size,
-                                 vocab_size,
-                                 stream);
            invokeTopPSampling<T>(workspace,
                                  workspace_size,
@@ -612,7 +606,7 @@ public:
        size_t batch_size = param.batch_size;
        size_t vocab_size = param.vocab_size;
-        float top_p = param.top_p;
+        float  top_p    = param.top_p;
        float* h_top_ps = new float[batch_size];
        // Initialize runtime top p values.
        for (size_t i = 0; i < batch_size; ++i) {
@@ -621,7 +615,7 @@ public:
        float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
        size_t output_len = param.output_len;
-        size_t seq_len = output_len;
+        size_t seq_len    = output_len;
        // Logit values in the host of shape (batch_size x vocab_size).
        T* h_logits = new T[batch_size * vocab_size];
@@ -647,8 +641,8 @@ public:
        struct cudaDeviceProp device_prop;
        cudaGetDeviceProperties(&device_prop, device);
-        curandState_t* curand_states = reinterpret_cast<curandState_t*>(
+        curandState_t* curand_states =
-            allocator->malloc(sizeof(curandState_t) * batch_size, false));
+            reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
        invokeCurandInitialize(curand_states, batch_size, seed, stream);
        float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
@@ -668,17 +662,17 @@ public:
        int* end_offsets      = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
        int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
-        size_t workspace_size = 0;
+        size_t workspace_size        = 0;
        size_t cub_temp_storage_size = 0;
        // retrieve the workspace size of the top-p sampling kernel.
        invokeBatchTopPSampling<T>(nullptr,  // workspace
                                   workspace_size,
                                   cub_temp_storage_size,
-                                   nullptr,  // output_ids
+                                   nullptr,      // output_ids
-                                   nullptr,  // sequence_length
+                                   nullptr,      // sequence_length
-                                   nullptr,  // finished_buffer
+                                   nullptr,      // finished_buffer
-                                   nullptr,  // cum_log_probs
+                                   nullptr,      // cum_log_probs
-                                   nullptr,  // output_log_probs
+                                   nullptr,      // output_log_probs
                                   (T*)nullptr,  // log_probs
                                   topp_id_vals_buf,
                                   end_offsets,
@@ -709,12 +703,7 @@ public:
            computeProb(h_probs, h_logits, batch_size, vocab_size);
            cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
-            invokeTopPInitialize(topp_id_vals_buf,
+            invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
-                                 end_offsets,
-                                 begin_offsets,
-                                 batch_size,
-                                 vocab_size,
-                                 stream);
            invokeBatchTopPSampling<T>(workspace,
                                       workspace_size,
@@ -773,8 +762,8 @@ public:
    {
        this->runBatchTest(param, false, false);
        this->runBatchTest(param, false, true);
-        this->runBatchTest(param, true,  false);
+        this->runBatchTest(param, true, false);
-        this->runBatchTest(param, true,  true);
+        this->runBatchTest(param, true, true);
    }
 };
@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
    this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
 };
-__global__
+__global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
-void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) {
+{
    int idx = threadIdx.x;
    if (idx < batch_size) {
        vals[idx] = curand(states + idx);
    }
 }
-TEST(SamplingKernelTest, CurandBatchInitialize) {
+TEST(SamplingKernelTest, CurandBatchInitialize)
-    size_t batch_size = 127;
+{
+    size_t       batch_size = 127;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    curandState_t* curand_states;
    check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
    unsigned long long* h_random_seeds = new unsigned long long[batch_size];
-    const size_t period_size = 3;
+    const size_t        period_size    = 3;
    for (size_t i = 0; i < batch_size; ++i) {
        h_random_seeds[i] = i / period_size;
    }
    unsigned long long* d_random_seeds;
    check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
-    check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds,
+    check_cuda_error(
-                                sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
+        cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
    // Initialize curand states.
    invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
    unsigned int* h_rand_vals = new unsigned int[batch_size];
    check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
    generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
-    check_cuda_error(cudaMemcpyAsync(
+    check_cuda_error(
-        h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
+        cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
    check_cuda_error(cudaStreamSynchronize(stream));
    // The same seed produces the same random number.

--- a/tests/csrc/unittests/test_sampling_layer.cu
+++ b/tests/csrc/unittests/test_sampling_layer.cu
--- a/tests/csrc/unittests/test_tensor.cu
+++ b/tests/csrc/unittests/test_tensor.cu
 #include <iostream>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 #include <gtest/gtest.h>
@@ -10,16 +10,17 @@ using namespace turbomind;
 namespace {
-#define EXPECT_EQUAL_TENSORS(t1, t2)       \
+#define EXPECT_EQUAL_TENSORS(t1, t2)                                                                                   \
-    do {                                   \
+    do {                                                                                                               \
-        EXPECT_TRUE(t1.where == t2.where); \
+        EXPECT_TRUE(t1.where == t2.where);                                                                             \
-        EXPECT_TRUE(t1.type == t2.type);   \
+        EXPECT_TRUE(t1.type == t2.type);                                                                               \
-        EXPECT_TRUE(t1.shape == t2.shape); \
+        EXPECT_TRUE(t1.shape == t2.shape);                                                                             \
-        EXPECT_TRUE(t1.data == t2.data);   \
+        EXPECT_TRUE(t1.data == t2.data);                                                                               \
-    } while(false)
+    } while (false)
-TEST(TensorMapTest, HasKeyCorrectness) {
+TEST(TensorMapTest, HasKeyCorrectness)
-    bool* v1 = new bool(true);
+{
+    bool*  v1 = new bool(true);
    float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
    delete[] v2;
 }
-TEST(TensorMapTest, InsertCorrectness) {
+TEST(TensorMapTest, InsertCorrectness)
-    int* v1 = new int[4]{1, 10, 20, 30};
+{
+    int*   v1 = new int[4]{1, 10, 20, 30};
    float* v2 = new float[2]{1.0f, 2.0f};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
    EXPECT_FALSE(map.isExist("t2"));
 }
-TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
+TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
+{
    TensorMap map;
    EXPECT_TRUE(map.size() == 0);
    // forbid a none tensor.
@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
    EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
 }
-TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
+TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
-    int* v1 = new int[4]{1, 10, 20, 30};
+{
-    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
+    int*      v1 = new int[4]{1, 10, 20, 30};
-    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
+    Tensor    t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
+    Tensor    t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
    TensorMap map({{"t1", t1}});
    EXPECT_TRUE(map.size() == 1);
    // forbid a duplicated key.
@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
    delete[] v1;
 }
-TEST(TensorMapTest, GetValCorrectness) {
+TEST(TensorMapTest, GetValCorrectness)
-    int* v1 = new int[4]{1, 10, 20, 30};
+{
+    int*   v1 = new int[4]{1, 10, 20, 30};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    TensorMap map({{"t1", t1}});
@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
    delete[] v1;
 }
-TEST(TensorMapTest, GetTensorCorrectness) {
+TEST(TensorMapTest, GetTensorCorrectness)
-    bool* t1_val = new bool(true);
+{
+    bool*  t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
-    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
+    Tensor t1     = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
-    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
+    Tensor t2     = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
-    int* default_val = new int[4]{0, 1, 2, 3};
+    int*   default_val    = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
    TensorMap map({{"t1", t1}, {"t2", t2}});
@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
    delete[] t1_val;
 }
-TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
+TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
-    bool* t1_val = new bool(true);
+{
+    bool*  t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
-    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
+    Tensor t1     = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
-    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
+    Tensor t2     = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
-    int* default_val = new int[4]{0, 1, 2, 3};
+    int*   default_val    = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
    const TensorMap map({{"t1", t1}, {"t2", t2}});
@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
    delete[] t1_val;
 }
-TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
+TEST(TensorTest, EmptyTensorMinMaxRaiseError)
+{
    Tensor t1;
    EXPECT_THROW(t1.min<int>(), std::runtime_error);
    EXPECT_THROW(t1.max<int>(), std::runtime_error);
@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
    EXPECT_THROW(t2.max<int>(), std::runtime_error);
 }
 using TensorTypes = testing::Types<int8_t, int, float>;
-template <typename T>
+template<typename T>
-class TensorFuncTest : public testing::Test {};
+class TensorFuncTest: public testing::Test {};
 TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);
-TYPED_TEST(TensorFuncTest, MaxCorrectness) {
+TYPED_TEST(TensorFuncTest, MaxCorrectness)
+{
    using T = TypeParam;
    size_t size = 4;
-    T* v1 = new T[size] {T(1), T(2), T(3), T(4)};
+    T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
-    T* v2 = new T[size] {T(4), T(3), T(2), T(1)};
+    T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
-    T* v3 = new T[size] {T(1), T(2), T(4), T(3)};
+    T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
    Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
    delete[] v3;
 }
-TYPED_TEST(TensorFuncTest, MinCorrectness) {
+TYPED_TEST(TensorFuncTest, MinCorrectness)
+{
    using T = TypeParam;
    size_t size = 4;
@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
    delete[] v3;
 }
-TYPED_TEST(TensorFuncTest, AnyCorrectness) {
+TYPED_TEST(TensorFuncTest, AnyCorrectness)
+{
    using T = TypeParam;
-    T* v = new T[4]{T(1), T(2), T(3), T(4)};
+    T*     v = new T[4]{T(1), T(2), T(3), T(4)};
    Tensor t = Tensor{MEMORY_CPU, getTensorType<T>(), {4}, v};
    EXPECT_TRUE(t.any<T>(T(1)));
    EXPECT_FALSE(t.any<T>(T(5)));
    delete[] v;
 }
-TYPED_TEST(TensorFuncTest, AllCorrectness) {
+TYPED_TEST(TensorFuncTest, AllCorrectness)
+{
    using T = TypeParam;
    constexpr size_t size = 4;
-    T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
+    T*               v1   = new T[size]{T(1), T(1), T(1), T(1)};
-    T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
+    T*               v2   = new T[size]{T(1), T(1), T(1), T(2)};
-    Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
+    Tensor           t1   = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
-    Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};
+    Tensor           t2   = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};
    EXPECT_TRUE(t1.all<T>(T(1)));
    EXPECT_FALSE(t2.all<T>(T(2)));
    delete[] v1;
    delete[] v2;
 }
-TYPED_TEST(TensorFuncTest, SliceCorrectness) {
+TYPED_TEST(TensorFuncTest, SliceCorrectness)
+{
    using T = TypeParam;
    constexpr int size = 12;
-    T* v = new T[size];
+    T*            v    = new T[size];
    for (int i = 0; i < size; ++i) {
        v[i] = i;
    }
    DataType dtype = getTensorType<T>();
-    Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
+    Tensor   t1    = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
-    Tensor t2 = t1.slice({2, 4}, 4);
+    Tensor   t2    = t1.slice({2, 4}, 4);
    EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));
    // An overflowed tensor throws an exception.
@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
    delete[] v;
 }
-} // end of namespace
+}  // end of namespace
--- a/tests/csrc/unittests/unittest_utils.h
+++ b/tests/csrc/unittests/unittest_utils.h
@@ -16,15 +16,15 @@
 #pragma once
-#include <algorithm>   // min, max
+#include <algorithm>  // min, max
-#include <assert.h>    // assert
+#include <assert.h>   // assert
-#include <float.h>     // FLT_MAX
+#include <float.h>    // FLT_MAX
-#include <iostream>    // snprintf
+#include <iostream>   // snprintf
-#include <math.h>      // expf, log
+#include <limits>     // numeric_limits
-#include <limits>      // numeric_limits
+#include <math.h>     // expf, log
-#include <stdlib.h>    // rand
+#include <stdlib.h>   // rand
-#include <string>      // string
+#include <string>     // string
-#include <vector>      // vector
+#include <vector>     // vector
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
@@ -36,32 +36,37 @@
 using namespace turbomind;
-class TestFailureError : public std::exception {
+class TestFailureError: public std::exception {
 private:
    std::string msg_;
 public:
    explicit TestFailureError() = default;
-    explicit TestFailureError(std::string name, std::string msg = "") {
+    explicit TestFailureError(std::string name, std::string msg = "")
+    {
        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
    }
-    const char* what () const throw () {
+    const char* what() const throw()
+    {
        return msg_.c_str();
    }
 };
-#define EXPECT_TRUE(cond)                                  \
+#define EXPECT_TRUE(cond)                                                                                              \
-    do { if(!(cond)) {                                     \
+    do {                                                                                                               \
-        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
+        if (!(cond)) {                                                                                                 \
-                     __func__, #cond, __FILE__, __LINE__); \
+            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
-        throw TestFailureError(__func__);                  \
+            throw TestFailureError(__func__);                                                                          \
-    } } while(false)
+        }                                                                                                              \
+    } while (false)
-#define EXPECT_FALSE(cond)                                 \
-    do { if(cond) {                                        \
+#define EXPECT_FALSE(cond)                                                                                             \
-        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
+    do {                                                                                                               \
-                     __func__, #cond, __FILE__, __LINE__); \
+        if (cond) {                                                                                                    \
-        throw TestFailureError(__func__);                  \
+            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
-    } } while(false)
+            throw TestFailureError(__func__);                                                                          \
+        }                                                                                                              \
+    } while (false)
 bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
 {
@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
 }
 template<typename T>
-bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float rtol) {
+bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
-    size_t failures = 0;
+{
-    float relative_gap = 0.0f;;
+    size_t failures     = 0;
+    float  relative_gap = 0.0f;
+    ;
    for (size_t i = 0; i < size; ++i) {
        // The values for the output and the reference.
@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * size);
    TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
-                failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
+                failures <= tol_failures ? "....OK" : "FAILED",
-                100. * failures / size, atol, rtol, 100. * relative_gap);
+                name.c_str(),
+                100. * failures / size,
+                atol,
+                rtol,
+                100. * relative_gap);
    return failures <= tol_failures;
 }
 template<typename T>
-bool checkResult(std::string name, T* out, T* ref, size_t size,
+bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
-                 bool device_out = true, bool device_ref = false)
 {
-    bool is_fp32 = sizeof(T) == 4;
+    bool  is_fp32 = sizeof(T) == 4;
-    float atol = is_fp32 ? 1e-4f : 1e-3f;
+    float atol    = is_fp32 ? 1e-4f : 1e-3f;
-    float rtol = is_fp32 ? 1e-2f : 1e-1f;
+    float rtol    = is_fp32 ? 1e-2f : 1e-1f;
    T* h_out = nullptr;
    if (device_out) {
@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
        ref = h_ref;
    }
    bool is_ok = checkResult(name, out, ref, size, atol, rtol);
-    if (h_out != nullptr){
+    if (h_out != nullptr) {
        delete[] h_out;
    }
    if (h_ref != nullptr) {
@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
 }
 template<typename T>
-void initRandom(T* ptr, size_t size, float minval, float maxval) {
+void initRandom(T* ptr, size_t size, float minval, float maxval)
+{
    for (size_t i = 0; i < size; ++i) {
        float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        val *= (maxval - minval);
@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
    }
 }
-void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
+void initRandomInt(int* ptr, size_t size, int minval, int maxval)
+{
    assert(minval < maxval);
    int mod = maxval - minval;
    for (size_t i = 0; i < size; ++i) {
@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
 }
 template<typename T>
-void tile(T* x, int m, int n) {
+void tile(T* x, int m, int n)
+{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            x[i * n + j] = x[j];
@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
 }
 template<typename T>
-void tile(T* dst, T* src, int m, int n) {
+void tile(T* dst, T* src, int m, int n)
+{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            dst[i * n + j] = src[j];
@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
 #define HALF_FLT_MAX 65504.0f
 template<typename T>
-bool isHalf() {
+bool isHalf()
+{
    return std::is_same<T, half>::value;
 }
 template<typename T>
-static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr) {
+static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
+{
    printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
 }