Unverified Commit 4c9959f6 authored by Chen Xin's avatar Chen Xin Committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
......@@ -51,7 +51,7 @@ void CustomAllReduceComm<T>::customAllReduce(size_t elts, cudaStream_t stream)
invokeOneOrTwoShotAllReduceKernel<T>(param_, stream);
// swap back
output_tensor_->at(0).data = (const void*)tmp_tensor_data_;
output_tensor_->at(0).data = (void*)tmp_tensor_data_;
}
template<typename T>
......@@ -114,7 +114,7 @@ bool CustomAllReduceComm<T>::swapInternalBuffer(std::vector<Tensor>* tensor_buff
if (rank_size_ > 1 && elts * sizeof(T) <= CUSTOM_AR_SIZE_THRESHOLD) {
tmp_tensor_data_ = (T*)(tensor_buffer->at(0).data);
output_tensor_ = tensor_buffer;
tensor_buffer->at(0).data = param_.peer_comm_buffer_ptrs[rank_];
tensor_buffer->at(0).data = (void*)param_.peer_comm_buffer_ptrs[rank_];
param_.local_output_buffer_ptr = tmp_tensor_data_;
return true;
}
......
......@@ -13,6 +13,8 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
set(gemm_func_files
gemm_func.cc
)
......@@ -50,58 +52,58 @@ set(swin_gemm_func_files
)
add_library(gemm_func STATIC ${gemm_func_files})
target_link_libraries(gemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
target_link_libraries(encoder_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
target_link_libraries(encoder_igemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_igemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
target_link_libraries(decoding_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
target_link_libraries(gpt_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gpt_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
target_link_libraries(xlnet_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
target_link_libraries(t5_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(t5_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
target_link_libraries(swin_igemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func encoder_igemm_func cuda_utils logger)
target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger)
set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
target_link_libraries(swin_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/decoding_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -137,7 +139,6 @@ void generate_decoding_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -195,7 +196,7 @@ void generate_decoding_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
......@@ -221,11 +222,12 @@ void generate_decoding_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -236,7 +238,7 @@ void generate_decoding_gemm_config(int batch_size,
if (data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/encoder_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -134,7 +136,6 @@ void generate_encoder_gemm_config(
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -212,7 +213,7 @@ void generate_encoder_gemm_config(
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i < 3) {
status = cublasGemmEx(cublas_handle,
......@@ -312,11 +313,12 @@ void generate_encoder_gemm_config(
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -327,7 +329,7 @@ void generate_encoder_gemm_config(
if (i < 3 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
......@@ -485,7 +487,7 @@ void generate_encoder_gemm_config(
&handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -516,10 +518,11 @@ void generate_encoder_gemm_config(
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "encoder_igemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
......@@ -83,7 +85,7 @@ int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
......@@ -149,7 +151,7 @@ int printBatchPerfStructure(
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
......@@ -228,10 +230,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
......@@ -251,11 +252,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
float time = dur.count();
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
......@@ -352,7 +354,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
......@@ -369,7 +371,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
......@@ -689,7 +691,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
......@@ -711,7 +713,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
......@@ -1166,11 +1168,10 @@ int generate_encoder_igemm_config(
}
if (do_sparse_test) {
printf("***cusparseLt Gemm Testing Begin***\n");
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
struct timeval start, end;
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
if (!isAppend) {
fd = fopen(SPIGEMM_CONFIG, "w+");
}
......@@ -1267,7 +1268,7 @@ int generate_encoder_igemm_config(
&handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_8I, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_8I, col_order))
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -1298,10 +1299,11 @@ int generate_encoder_igemm_config(
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -24,9 +24,11 @@
#include <map>
#include <stdio.h>
#include <stdlib.h>
#ifdef __linux__
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#endif
#include <time.h>
#include <vector>
namespace turbomind {
......
......@@ -17,6 +17,7 @@
#include "encoder_gemm_func.h"
#include <assert.h>
#include <sys/types.h>
#include <vector>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
......@@ -268,17 +269,17 @@ int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// Let try a fixed number of combinations
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
const int maxNumTraversal = 50; // max number of traversal
cublasLtMatmulAlgo_t algos[AlgoCombinations]; // 0 <= workspace <= 32MB
cublasLtMatmulAlgo_t algosRestrict[AlgoCombinations]; // workspace == 0
const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
const int maxNumTraversal = 50; // max number of traversal
std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations); // 0 <= workspace <= 32MB
std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations); // workspace == 0
const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
......
......@@ -28,10 +28,12 @@
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -232,7 +234,6 @@ void generate_gpt_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -332,7 +333,7 @@ void generate_gpt_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 1) {
status = cublasGemmStridedBatchedEx(cublas_handle,
......@@ -432,11 +433,12 @@ void generate_gpt_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -450,7 +452,7 @@ void generate_gpt_gemm_config(int batch_size,
if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) {
printf("***cublasLt Gemm Testing Beign***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 10000;
const int ALGO_COMBINATIONS = 10000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for gpt, computeType & scaleType should be FP32
......@@ -644,7 +646,7 @@ void generate_gpt_gemm_config(int batch_size,
CHECK_CUSPARSE(
cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -675,10 +677,11 @@ void generate_gpt_gemm_config(int batch_size,
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -32,8 +32,10 @@
#endif
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/swin_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -140,7 +142,6 @@ void generate_swin_gemm_config(
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -211,7 +212,7 @@ void generate_swin_gemm_config(
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i < 5) {
status = cublasGemmEx(cublas_handle,
......@@ -289,11 +290,12 @@ void generate_swin_gemm_config(
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -304,7 +306,7 @@ void generate_swin_gemm_config(
if (i < 5 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,7 @@
*/
#include "swin_igemm_func.h"
#include <chrono>
namespace turbomind {
......@@ -86,10 +87,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
......@@ -109,11 +109,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
float time = dur.count();
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
......
......@@ -25,9 +25,11 @@
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/t5_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -202,7 +204,6 @@ void generate_t5_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -286,7 +287,7 @@ void generate_t5_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 0) {
status = cublasGemmBatchedEx(cublas_handle,
......@@ -408,11 +409,12 @@ void generate_t5_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -431,7 +433,7 @@ void generate_t5_gemm_config(int batch_size,
if (data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 0 && i != 10) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for t5, computeType & scaleType should be FP32
......@@ -643,7 +645,7 @@ void generate_t5_gemm_config(int batch_size,
CHECK_CUSPARSE(
cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -674,10 +676,11 @@ void generate_t5_gemm_config(int batch_size,
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -225,7 +227,6 @@ void generate_xlnet_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -285,7 +286,7 @@ void generate_xlnet_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 1 || i == 7 || i == 8 || i == 9) {
status = cublasGemmEx(cublas_handle,
......@@ -338,11 +339,12 @@ void generate_xlnet_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
} // end if diffTime
} // end status
......@@ -353,7 +355,7 @@ void generate_xlnet_gemm_config(int batch_size,
if ((i == 1 || i == 7 || i == 8 || i == 9) && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment