Commit 892cb743 authored by Chao Liu's avatar Chao Liu
Browse files

Merge remote-tracking branch 'origin/develop' into bf16_int8_ckprofiler

parents ba132d28 992f71e3
...@@ -45,7 +45,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali ...@@ -45,7 +45,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
// clang-format off // clang-format off
#if 1 #if 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| //######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch|
...@@ -53,6 +53,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl ...@@ -53,6 +53,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// [256, 128, 4, 8], 1 stage, 2 occupancy // [256, 128, 4, 8], 1 stage, 2 occupancy
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>; < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>;
#elif 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
//######|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>;
#elif 0 #elif 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
...@@ -82,14 +89,6 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl ...@@ -82,14 +89,6 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
// [ 64, 144, 4, 8], 2 stage, 2 occupancy // [ 64, 144, 4, 8], 2 stage, 2 occupancy
// 85 TFlops // 85 TFlops
// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 144, 4, 8, 16, 16, 1, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 2>; // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 144, 4, 8, 16, 16, 1, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 2>;
#elif 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Num|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Prefetch|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| |
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// [128, 144, 8, 8], 1 stage, 1 occupancy, bounded by LDS size
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 1, 9, S<1, 1, 8, 1, 9, 2>, 8, 1>;
#endif #endif
// clang-format on // clang-format on
...@@ -170,9 +169,13 @@ int main(int argc, char* argv[]) ...@@ -170,9 +169,13 @@ int main(int argc, char* argv[])
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}); b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break; break;
default: case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
} }
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
......
add_subdirectory(host_tensor) add_subdirectory(host_tensor)
\ No newline at end of file
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
#include <iostream> #include <iostream>
#include "data_type.hpp" #include "data_type.hpp"
template <typename Range> template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{ {
......
...@@ -153,4 +153,15 @@ struct GeneratorTensor_Checkboard ...@@ -153,4 +153,15 @@ struct GeneratorTensor_Checkboard
} }
}; };
// Generator that fills a tensor with the coordinate value along dimension Dim:
// operator()(i0, i1, ..., iN) returns the Dim-th index as a float.
// Useful for deterministic, easy-to-verify initialization patterns in tests.
template <ck::index_t Dim>
struct GeneratorTensor_Sequential
{
    template <typename... Ts>
    float operator()(Ts... Xs) const
    {
        // Reject an out-of-range Dim at compile time instead of invoking
        // undefined behavior through an out-of-bounds std::array access.
        static_assert(Dim >= 0 && Dim < static_cast<ck::index_t>(sizeof...(Ts)),
                      "GeneratorTensor_Sequential: Dim must be less than the tensor rank");

        // NOTE(review): relies on <array> being available; confirm it is pulled
        // in by data_type.hpp or another transitive include in this header.
        std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
        return dims[Dim];
    }
};
#endif #endif
...@@ -5,4 +5,4 @@ ignore = pcre ...@@ -5,4 +5,4 @@ ignore = pcre
deps = deps =
-f dev-requirements.txt -f dev-requirements.txt
define = define =
BUILD_DEV=On BUILD_DEV=On
\ No newline at end of file
...@@ -13,58 +13,24 @@ include_directories(BEFORE ...@@ -13,58 +13,24 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/test/include ${PROJECT_SOURCE_DIR}/test/include
) )
# test_magic_number_division add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
set(MAGIC_NUMBER_DIVISISON_SOURCE magic_number_division/main.cpp) add_custom_target(tests)
add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISISON_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor) function(add_test_executeable TEST_NAME)
add_executable(${TEST_NAME} ${ARGN})
target_link_libraries(${TEST_NAME} PRIVATE host_tensor)
set(CONV2D_FWD_SOURCE conv2d_fwd/main.cpp) target_link_libraries(${TEST_NAME} PRIVATE device_gemm_instance)
target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_fwd_instance)
add_executable(test_conv2d_fwd ${CONV2D_FWD_SOURCE}) add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
target_link_libraries(test_conv2d_fwd PRIVATE host_tensor) add_dependencies(tests ${TEST_NAME})
target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance) add_dependencies(check ${TEST_NAME})
endfunction(add_test_executeable TEST_NAME)
# test_split_k
set(SPLIT_K_SOURCE split_k/main.cpp)
add_executable(test_split_k ${SPLIT_K_SOURCE}) file(GLOB TESTS *.cpp)
target_link_libraries(test_split_k PRIVATE host_tensor)
target_link_libraries(test_split_k PRIVATE device_gemm_instance) foreach(TEST ${TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
# test_conv_util message("adding test ${BASE_NAME}")
set(CONV_UTIL_SOURCE conv_util/main.cpp) add_test_executeable(test_${BASE_NAME} ${TEST})
add_executable(test_conv_util ${CONV_UTIL_SOURCE}) endforeach(TEST ${TESTS})
target_link_libraries(test_conv_util PRIVATE host_tensor)
# test_reference_conv_fwd
set(REFERENCE_CONV_FWD_SOURCE reference_conv_fwd/main.cpp)
add_executable(test_reference_conv_fwd ${REFERENCE_CONV_FWD_SOURCE})
target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
# test_convnd_fwd_xdl
set(CONVND_FWD_XDL_SOURCE convnd_fwd_xdl/main.cpp)
add_executable(test_convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
target_link_libraries(test_convnd_fwd_xdl PRIVATE host_tensor)
# test_gemm_xdl_fp32
set(GEMM_XDL_FP32_SOURCE gemm_xdl/test_gemm_fp32.cpp)
add_executable(test_gemm_xdl_fp32 ${GEMM_XDL_FP32_SOURCE})
target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
# test_gemm_xdl_bf16
set(GEMM_XDL_BF16_SOURCE gemm_xdl/test_gemm_bf16.cpp)
add_executable(test_gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
# test_gemm_xdl_int8
set(GEMM_XDL_INT8_SOURCE gemm_xdl/test_gemm_int8.cpp)
add_executable(test_gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
# test space_filling_curve_
set(SPACE_FILLING_CURVE_SOURCE space_filling_curve/space_filling_curve.cpp)
add_executable(space_filling_curve ${SPACE_FILLING_CURVE_SOURCE})
target_link_libraries(space_filling_curve PRIVATE host_tensor)
...@@ -75,8 +75,12 @@ int main(int argc, char* argv[]) ...@@ -75,8 +75,12 @@ int main(int argc, char* argv[])
ck::index_t in_left_pad_w = 1; ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1; ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1; ck::index_t in_right_pad_w = 1;
if(argc == 1)
if(argc == 3) {
init_method = 1;
data_type = 0;
}
else if(argc == 3)
{ {
data_type = std::stoi(argv[1]); data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
...@@ -275,20 +279,22 @@ int main(int argc, char* argv[]) ...@@ -275,20 +279,22 @@ int main(int argc, char* argv[])
if(success) if(success)
{ {
std::cout << "test conv2d fwd : Pass" << std::endl; std::cout << "test conv2d fwd : Pass" << std::endl;
return 0;
} }
else else
{ {
std::cout << "test conv2d fwd: Fail " << std::endl; std::cout << "test conv2d fwd: Fail " << std::endl;
return -1;
} }
}; };
int res = -1;
if(data_type == 0) if(data_type == 0)
{ {
Run(float(), float(), float()); res = Run(float(), float(), float());
} }
else if(data_type == 1) else if(data_type == 1)
{ {
Run(ck::half_t(), ck::half_t(), ck::half_t()); res = Run(ck::half_t(), ck::half_t(), ck::half_t());
} }
else if(data_type == 2) else if(data_type == 2)
{ {
...@@ -296,12 +302,8 @@ int main(int argc, char* argv[]) ...@@ -296,12 +302,8 @@ int main(int argc, char* argv[])
} }
else if(data_type == 3) else if(data_type == 3)
{ {
Run(int8_t(), int8_t(), int8_t()); res = Run(int8_t(), int8_t(), int8_t());
}
else
{
return 1;
} }
return 0; return res;
} }
...@@ -111,7 +111,8 @@ bool TestGemm(DeviceGemmPtr_& gemmPtr) ...@@ -111,7 +111,8 @@ bool TestGemm(DeviceGemmPtr_& gemmPtr)
gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
// Assert // Assert
bool res = test_util::check_err(c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); bool res = test_util::check_err(
c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
......
...@@ -161,11 +161,11 @@ int main(int, char*[]) ...@@ -161,11 +161,11 @@ int main(int, char*[])
if(pass) if(pass)
{ {
std::cout << "test magic number division: Pass" << std::endl; std::cout << "test magic number division: Pass" << std::endl;
return 0;
} }
else else
{ {
std::cout << "test magic number division: Fail" << std::endl; std::cout << "test magic number division: Fail" << std::endl;
return -1;
} }
return 1;
} }
...@@ -57,32 +57,23 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result) ...@@ -57,32 +57,23 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
return true; return true;
} }
int main(int argc, char* argv[]) struct gemmArgs
{ {
if(argc != 9) int layout;
{ int M;
printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); int N;
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); int K;
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); int StrideA;
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); int StrideB;
printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n"); int StrideC;
return 1; int KBatch;
} };
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
const int M = std::stoi(argv[2]);
const int N = std::stoi(argv[3]);
const int K = std::stoi(argv[4]);
const int StrideA = std::stoi(argv[5]);
const int StrideB = std::stoi(argv[6]);
const int StrideC = std::stoi(argv[7]);
const int KBatch = std::stoi(argv[8]);
int test_gemm(const gemmArgs& args)
{
bool a_row_major, b_row_major, c_row_major; bool a_row_major, b_row_major, c_row_major;
switch(layout) switch(args.layout)
{ {
case GemmMatrixLayout::MK_KN_MN: case GemmMatrixLayout::MK_KN_MN:
a_row_major = true; a_row_major = true;
...@@ -121,10 +112,12 @@ int main(int argc, char* argv[]) ...@@ -121,10 +112,12 @@ int main(int argc, char* argv[])
} }
}; };
Tensor<float> a_m_k(f_host_tensor_descriptor(M, K, StrideA, a_row_major)); Tensor<float> a_m_k(f_host_tensor_descriptor(args.M, args.K, args.StrideA, a_row_major));
Tensor<float> b_k_n(f_host_tensor_descriptor(K, N, StrideB, b_row_major)); Tensor<float> b_k_n(f_host_tensor_descriptor(args.K, args.N, args.StrideB, b_row_major));
Tensor<float> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, c_row_major)); Tensor<float> c_m_n_host_result(
Tensor<float> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, c_row_major)); f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
Tensor<float> c_m_n_device_result(
f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
// init data // init data
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
...@@ -151,17 +144,17 @@ int main(int argc, char* argv[]) ...@@ -151,17 +144,17 @@ int main(int argc, char* argv[])
// add device GEMM instances // add device GEMM instances
std::vector<DeviceGemmNoOpPtr> gemm_ptrs; std::vector<DeviceGemmNoOpPtr> gemm_ptrs;
if(layout == GemmMatrixLayout::MK_KN_MN) if(args.layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs);
} }
else if(layout == GemmMatrixLayout::MK_NK_MN) else if(args.layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs);
} }
else if(layout == GemmMatrixLayout::KM_KN_MN) else if(args.layout == GemmMatrixLayout::KM_KN_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs);
...@@ -179,16 +172,16 @@ int main(int argc, char* argv[]) ...@@ -179,16 +172,16 @@ int main(int argc, char* argv[])
gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()), gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
static_cast<float*>(b_device_buf.GetDeviceBuffer()), static_cast<float*>(b_device_buf.GetDeviceBuffer()),
static_cast<float*>(c_device_buf.GetDeviceBuffer()), static_cast<float*>(c_device_buf.GetDeviceBuffer()),
M, args.M,
N, args.N,
K, args.K,
StrideA, args.StrideA,
StrideB, args.StrideB,
StrideC, args.StrideC,
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
KBatch); args.KBatch);
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
...@@ -205,7 +198,7 @@ int main(int argc, char* argv[]) ...@@ -205,7 +198,7 @@ int main(int argc, char* argv[])
success = true; success = true;
} }
} }
auto error_code = 0;
if(success) if(success)
{ {
std::cout << "test split k : Pass" << std::endl; std::cout << "test split k : Pass" << std::endl;
...@@ -213,6 +206,48 @@ int main(int argc, char* argv[]) ...@@ -213,6 +206,48 @@ int main(int argc, char* argv[])
else else
{ {
std::cout << "test split k: Fail " << std::endl; std::cout << "test split k: Fail " << std::endl;
error_code = -1; // test needs to report failure
}
return error_code;
}
// Entry point for the split-K GEMM test.
// With no CLI arguments, runs a built-in default case; with 8 arguments,
// runs the single case described by them (layout, M, N, K, StrideA, StrideB,
// StrideC, KBatch). Returns 0 when every case passes, -1 otherwise.
int main(int argc, char* argv[])
{
    std::vector<gemmArgs> test_cases;

    if(argc == 1)
    {
        // Default smoke-test case used when no arguments are given.
        // TODO(JD): populate with more (and more meaningful) cases.
        test_cases = {{0, 3, 3, 3, 3, 3, 3, 1}};
        // Bug fix: this branch previously returned 0 immediately after
        // populating test_cases, so the default case was never executed and
        // the test always reported success without testing anything.
    }
    else if(argc == 9)
    {
        const int layout  = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
        const int M       = std::stoi(argv[2]);
        const int N       = std::stoi(argv[3]);
        const int K       = std::stoi(argv[4]);
        const int StrideA = std::stoi(argv[5]);
        const int StrideB = std::stoi(argv[6]);
        const int StrideC = std::stoi(argv[7]);
        const int KBatch  = std::stoi(argv[8]);

        test_cases = {{layout, M, N, K, StrideA, StrideB, StrideC, KBatch}};
    }
    else
    {
        printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
        return -1;
    }

    for(const auto& test_case : test_cases)
    {
        // test_gemm returns 0 on success and a negative error code on failure.
        // Bug fix: the previous check `if(!res) return -1;` was inverted — it
        // reported failure exactly when the test succeeded (res == 0).
        const auto res = test_gemm(test_case);
        if(res != 0)
            return -1;
    }
    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment