Merge remote-tracking branch 'origin/develop' into migx-flash-attn

4100d1d8 · Alan Turner · 48717006 · c8a8385f · 4100d1d8 · 4100d1d8
Commit 4100d1d8 authored Aug 23, 2023 by Alan Turner
20 changed files
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -13,8 +13,12 @@ using ck::index_t;

 struct maxPoolFwdArgParser
 {
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
-        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};

    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
@@ -56,10 +60,11 @@ void print_help_max_pool3d_fwd()
              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
-                 "--wstride 2 2 2 --pad1 1 1 1 --pad2 1 1 1"
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
 }

@@ -75,15 +80,16 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
    std::vector<index_t> wsize     = {2, 2, 2};
    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> wdilation = {1, 1, 1};
    std::vector<index_t> pad1      = {1, 1, 1};
    std::vector<index_t> pad2      = {1, 1, 1};

-    if(argc != 2 && argc != 30)
+    if(argc != 2 && argc != 34)
    {
        print_help_max_pool3d_fwd();
        return 0;
    }
-    else if(argc == 30)
+    else if(argc == 34)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
@@ -98,64 +104,79 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
        in_length = arg_parser.long_opts["length"];
        wsize     = arg_parser.long_opts["wsize"];
        wstride   = arg_parser.long_opts["wstride"];
+        wdilation = arg_parser.long_opts["wdilation"];
        pad1      = arg_parser.long_opts["pad1"];
        pad2      = arg_parser.long_opts["pad2"];
    }

-    using F16                 = ck::half_t;
-    using F32                 = float;
-    using I32                 = int32_t;
+    using F16   = ck::half_t;
+    using F32   = float;
+    using I32   = int32_t;
+    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+
+#if 1
    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif

    if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, true>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                in_length,
-                wsize,
-                wstride,
-                pad1,
-                pad2);
+            ck::profiler::
+                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    in_length,
+                    wsize,
+                    wstride,
+                    wdilation,
+                    pad1,
+                    pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, false>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                in_length,
-                wsize,
-                wstride,
-                pad1,
-                pad2);
+            ck::profiler::
+                profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    in_length,
+                    wsize,
+                    wstride,
+                    wdilation,
+                    pad1,
+                    pad2);
    }
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, true>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                in_length,
-                wsize,
-                wstride,
-                pad1,
-                pad2);
+            ck::profiler::
+                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    in_length,
+                    wsize,
+                    wstride,
+                    wdilation,
+                    pad1,
+                    pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                in_length,
-                wsize,
-                wstride,
-                pad1,
-                pad2);
+            ck::profiler::
+                profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    in_length,
+                    wsize,
+                    wstride,
+                    wdilation,
+                    pad1,
+                    pad2);
    }
    else
    {

--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {

--- a/script/check_copyright_year.sh
+++ b/script/check_copyright_year.sh
+#!/bin/bash
+
+# For every file given on the command line that contains a "Copyright (c)"
+# line, require that such a line also mentions the current year.
+# Exit status: 1 if at least one file fails the check, 0 otherwise.
+
+current_year=$(date +%Y)
+exit_code=0
+
+# "$@" and "$file" are quoted so that paths containing spaces or glob
+# characters stay a single argument; "--" keeps a path that starts with "-"
+# from being parsed as a grep option.
+for file in "$@"; do
+    if grep -q "Copyright (c)" -- "$file"
+    then
+        if ! grep -q "Copyright (c).*$current_year" -- "$file"
+        then
+            echo "ERROR: File $file has a copyright notice without the current year ($current_year)."
+            exit_code=1
+        fi
+    fi
+done
+
+exit $exit_code
--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
-#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
-git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,7 +12,7 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/install_precommit.sh
+++ b/script/install_precommit.sh
+#!/bin/bash
+
+# Install the tools used by the pre-commit checks (clang-format-12 through
+# apt, pre-commit through pip3) and then run `pre-commit install`.
+
+# Run the given command line. If it exits non-zero, report the failure on
+# stderr and terminate the whole script with that same status.
+run_and_check() {
+    local status
+    "$@"
+    status=$?
+    if [ "$status" -ne 0 ]; then
+        # "$*" joins the arguments into one word; "$@" inside a longer string
+        # would split the message into several separate arguments.
+        echo "Error with \"$*\": Exited with status $status" >&2
+        exit "$status"
+    fi
+    return "$status"
+}
+
+echo "I: Installing tools required for pre-commit checks..."
+run_and_check apt install clang-format-12
+
+echo "I: Installing pre-commit itself..."
+run_and_check pip3 install pre-commit
+run_and_check pre-commit install
+
+echo "I: Installation successful."
--- a/script/profile_batched_gemm.sh
+++ b/script/profile_batched_gemm.sh
@@ -3,13 +3,6 @@
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
 DRIVER="../build/bin/ckProfiler"
-OP=$1
-DATATYPE=$2
-LAYOUT=$3
-VERIFY=$4
-INIT=$5
-LOG=$6
-TIME=$7
 
 OP=$1
 DATATYPE=$2

--- a/script/uninstall_precommit.sh
+++ b/script/uninstall_precommit.sh
+#!/bin/bash
+
+# Counterpart of install_precommit.sh: runs `pre-commit uninstall`.
+pre-commit uninstall
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -61,7 +61,8 @@ else()
    add_subdirectory(batchnorm)
    add_subdirectory(contraction)
    add_subdirectory(pool_fwd)
-endif()
-if(GPU_TARGETS MATCHES "gfx1100")
+    add_subdirectory(batched_gemm_multi_d)
+    add_subdirectory(grouped_convnd_bwd_data)
+if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -2,21 +2,26 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
-   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
-
-   add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
-   target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
-
-   add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
-   target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
-
-   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
-   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
+      target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
+   endif()
+   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+      target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
+   endif()
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
+      target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
+   endif()
+   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+      target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+   endif()
   set(target 1)
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_gemm)
-   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
-   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
-   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-   set(target 1)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+      add_custom_target(test_batched_gemm_gemm)
+      add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
+      target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
+      add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+      set(target 1)
+   endif()
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
-# TODO: Enable for gfx90a after complier fix
-if(NOT GPU_TARGETS MATCHES "gfx90a")
+if(DL_KERNELS)
    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
 endif()
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <gtest/gtest.h>
@@ -68,7 +68,9 @@ using KernelTypes = ::testing::Types<std::tuple<Row, Row, Row>,
 } // namespace

 TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
-
+#ifdef CK_ENABLE_FP16 // was `__fp16`, a type keyword rather than a macro; presumably this mirrors CK_ENABLE_INT8 below - TODO confirm
 TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
-
+#endif
+#ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
+#endif
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
-   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
-   set(target 1)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+     add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
+     target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
+     target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
+     set(target 1)
+   endif()
 endif()
 endforeach()
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_softmax_gemm)
-   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
-   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
-   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
-   set(target 1)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+     add_custom_target(test_batched_gemm_softmax_gemm)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
+     target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
+     add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
+     set(target 1)
+   endif()
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
@@ -2,21 +2,25 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_softmax_gemm_permute)
-
-   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
-   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
-   target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-   target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
-
-   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
-   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
-   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
+   if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+     add_custom_target(test_batched_gemm_softmax_gemm_permute)
+   endif()
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
+     add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
+     target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
+   endif()
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
+     add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
+     target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
+   endif()
   set(target 1)
 endif()
 endforeach()
\ No newline at end of file
--- a/test/block_swizzle_test/block_swizzle_test.cpp
+++ b/test/block_swizzle_test/block_swizzle_test.cpp
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+#include "simple_args.h"
+
+// Registers every command-line option this tool understands together with its
+// default value, then overlays whatever the user passed in argv.
+//
+// argc/argv are forwarded unchanged to simple_args_t::parse(). parse() returns
+// false both for malformed input and for the "-?" help request, and in either
+// case it has already printed a message. The process therefore stops here
+// instead of silently running the simulation with the default values.
+//
+// Returns the populated argument set.
+simple_args_t create_arg(int argc, char** argv)
+{
+    simple_args_t args;
+    args.insert("m", "1024", "matrix m")
+        .insert("n", "1024", "matrix n")
+        .insert("k", "1024", "matrix k")
+        .insert("m_per_block", "128", "m_per_block")
+        .insert("n_per_block", "128", "n_per_block")
+        .insert("k_per_block", "32", "k_per_block")
+        .insert("num_cu", "104", "num cu")
+        .insert("occupancy", "2", "occupancy");
+
+    if(!args.parse(argc, argv))
+    {
+        // parse() cannot tell a help request from an error, so both exit non-zero
+        exit(EXIT_FAILURE);
+    }
+    return args;
+}
+
+namespace impl {
+
+// Evaluates (n + d - 1) / d, i.e. n / d rounded up for non-negative integers.
+// d must not be zero.
+template <typename T>
+T integer_divide_ceil(T n, T d)
+{
+    const auto biased_numerator = n + d - 1;
+    return biased_numerator / d;
+}
+
+// Returns b when a compares greater than b, otherwise a.
+template <typename T>
+T min(T a, T b)
+{
+    if(a > b)
+    {
+        return b;
+    }
+    return a;
+}
+
+// Returns a when a compares greater than b, otherwise b.
+template <typename T>
+T max(T a, T b)
+{
+    if(a > b)
+    {
+        return a;
+    }
+    return b;
+}
+
+} // namespace impl
+
+// Host-side model of how the k-iterations of a tiled GEMM are distributed over
+// a grid of blocks.
+//
+// The grid is laid out as
+//   [0, sk_num_blocks)                      "sk" blocks, which share sk_total_iters
+//                                           iterations as evenly as possible
+//   [sk_num_blocks, dp_start_block_idx)     idle padding blocks (no work)
+//   [dp_start_block_idx, get_grid_dims_x()) "dp" blocks, one whole tile each
+//
+// Preconditions (not checked here): m_per_block, n_per_block, k_per_block,
+// num_cu and occupancy must all be non-zero, otherwise init() divides by zero.
+struct block_dispatcher_t
+{
+    public:
+    // constructor inputs: tile shape, hardware description and problem size
+    uint32_t m_per_block;
+    uint32_t n_per_block;
+    uint32_t k_per_block;
+    uint32_t num_cu;
+    uint32_t occupancy;
+    uint32_t m;
+    uint32_t n;
+    uint32_t k;
+
+    //--------------------------------------
+    // everything below is derived by init()
+
+    uint32_t sk_num_blocks     = 0; // number of sk blocks (0 means "dp only")
+    uint32_t sk_num_big_blocks = 0; // leading sk blocks that run k_iters_per_big_block iters
+    uint32_t sk_total_iters    = 0; // iterations shared among all sk blocks
+
+    // uint32_t sk_num_blocks_per_tile;    // how many
+
+    uint32_t dp_start_block_idx = 0; // block index of the first dp block
+    uint32_t dp_iters_per_block = 0; // always k_iters_per_tile
+    uint32_t dp_num_blocks      = 0; // number of dp blocks
+
+    uint32_t k_iters_per_tile      = 0; // ceil(k / k_per_block)
+    uint32_t k_iters_per_big_block = 0; // the remaining sk blocks run one iteration fewer
+    //--------------------------------------
+
+    static constexpr uint32_t min_k_iters_per_sk_block = 1;
+
+    // Prints the inputs and every derived quantity on one line of stdout.
+    void dump() const
+    {
+        printf("%ux%ux%u(%ux%ux%u), cu:%u, occ:%u, grids:%u, sk_num_big_blocks:%u, "
+               "sk_num_blocks:%u, sk_total_iters:%u, dp_start_block_idx:%u, dp_iters_per_block:%u, "
+               "dp_num_blocks:%u, k_iters_per_tile:%u, k_iters_per_big_block:%u\n",
+               m,
+               n,
+               k,
+               m_per_block,
+               n_per_block,
+               k_per_block,
+               num_cu,
+               occupancy,
+               get_grid_dims_x(),
+               sk_num_big_blocks,
+               sk_num_blocks,
+               sk_total_iters,
+               dp_start_block_idx,
+               dp_iters_per_block,
+               dp_num_blocks,
+               k_iters_per_tile,
+               k_iters_per_big_block);
+    }
+
+    // Stores the inputs and immediately derives the dispatch via init().
+    block_dispatcher_t(uint32_t m_per_block_,
+                       uint32_t n_per_block_,
+                       uint32_t k_per_block_,
+                       uint32_t num_cu_,
+                       uint32_t occupancy_,
+                       uint32_t m_,
+                       uint32_t n_,
+                       uint32_t k_)
+        : m_per_block(m_per_block_),
+          n_per_block(n_per_block_),
+          k_per_block(k_per_block_),
+          num_cu(num_cu_),
+          occupancy(occupancy_),
+          m(m_),
+          n(n_),
+          k(k_)
+    {
+        init();
+    }
+
+    // Total number of blocks in the simulated grid (idle padding blocks included).
+    uint32_t get_grid_dims_x() const { return dp_start_block_idx + dp_num_blocks; }
+
+    // Maps a launch block id to a logical block index. Currently the identity.
+    uint32_t get_block_idx(uint32_t bid) const
+    {
+        // block id is linearly allocated along sk blocks (dp blocks are fine)
+        // this function will compute blockIdx.x and the linear sk block mapping
+        // uint32_t block_idx = 0;
+        // if(bid < sk_num_big_blocks) {
+        //     uint32_t current_k_iter = bid * k_iters_per_big_block;
+        //     tile_idx = current_k_iter / k_iters_per_tile;
+        // }
+        return bid;
+    }
+
+    // Returns the first global iteration owned by block_idx, or 0 for an idle
+    // padding block.
+    uint32_t get_current_itr(uint32_t block_idx) const
+    {
+        uint32_t current_itr = 0;
+        if(block_idx < sk_num_big_blocks)
+        {
+            current_itr = block_idx * k_iters_per_big_block;
+        }
+        else if(block_idx < sk_num_blocks)
+        {
+            current_itr = (sk_num_big_blocks * k_iters_per_big_block) +
+                          (block_idx - sk_num_big_blocks) * (k_iters_per_big_block - 1);
+        }
+        else if(block_idx >= dp_start_block_idx)
+        {
+            current_itr = sk_total_iters + (block_idx - dp_start_block_idx) * dp_iters_per_block;
+        }
+        return current_itr;
+    }
+
+    // Writes the half-open global iteration range [iter_start, iter_end) owned by
+    // block_idx. Idle padding blocks get the empty range [0, 0), so the outputs
+    // are always assigned.
+    void get_block_itr(uint32_t block_idx, uint32_t& iter_start, uint32_t& iter_end) const
+    {
+        if(block_idx < sk_num_big_blocks)
+        {
+            iter_start = block_idx * k_iters_per_big_block;
+            iter_end   = iter_start + k_iters_per_big_block;
+        }
+        else if(block_idx < sk_num_blocks)
+        {
+            iter_start = (sk_num_big_blocks * k_iters_per_big_block) +
+                         (block_idx - sk_num_big_blocks) * (k_iters_per_big_block - 1);
+            iter_end = iter_start + (k_iters_per_big_block - 1);
+        }
+        else if(block_idx >= dp_start_block_idx)
+        {
+            iter_start = sk_total_iters + (block_idx - dp_start_block_idx) * dp_iters_per_block;
+            iter_end   = iter_start + dp_iters_per_block;
+        }
+        else
+        {
+            // padding block between the sk region and the dp region: no work
+            iter_start = 0;
+            iter_end   = 0;
+        }
+    }
+
+    private:
+    // Derives every sk_* / dp_* / k_iters_* member from the constructor inputs.
+    void init()
+    {
+        uint32_t num_tiles =
+            impl::integer_divide_ceil(m, m_per_block) * impl::integer_divide_ceil(n, n_per_block);
+        k_iters_per_tile = impl::integer_divide_ceil(k, k_per_block);
+
+        // one cu can hold one wg at one time, from the whole chip's point of view
+        // if number of wg is same as num_cu, we call it 1 dispatch
+        // if number of wg is 2x num_cu, we call it 2 dispatches.
+        // one dispatch can deliver wg same as num_cu (full dispatch), or less than num_cu (partial
+        // dispatch)
+        //
+        uint32_t full_dispatches        = num_tiles / num_cu;
+        uint32_t full_dispatch_tiles    = full_dispatches * num_cu;
+        uint32_t partial_dispatch_tiles = num_tiles - full_dispatch_tiles;
+
+        uint32_t sk_occupancy = occupancy;
+        uint32_t dp_tiles     = full_dispatch_tiles;
+        uint32_t sk_tiles     = partial_dispatch_tiles;
+
+        if(full_dispatches < occupancy)
+        {
+            // in this case, we allocate all blocks as sk blocks
+            // sk_occupancy = occupancy - full_dispatches;
+            sk_occupancy = 1; // TODO: single occ seems better
+            dp_tiles     = full_dispatch_tiles;
+            sk_tiles     = partial_dispatch_tiles;
+        }
+        else if((occupancy > 1) && (full_dispatches % occupancy == occupancy - 1))
+        {
+            // e.g. occupancy = 2, full_dispatches = 3, 5, 7 ...
+            //      occupancy = 3, full_dispatches = 5, 8, 11 ...
+            //      occupancy = 4, full_dispatches = 7, 11 ...
+            sk_occupancy = 1; // left 1 slot for sk occupancy
+            dp_tiles     = full_dispatch_tiles;
+            sk_tiles     = partial_dispatch_tiles;
+        }
+        else
+        {
+            // others, we reduce 1 dispatch from dp, together with partial dispatch,
+            // to construct sk dispatch
+            sk_occupancy = occupancy - ((full_dispatches - 1) % occupancy);
+            dp_tiles     = full_dispatch_tiles - num_cu;
+            sk_tiles     = partial_dispatch_tiles + num_cu;
+        }
+
+        // dp_num_blocks = dp_tiles;
+        // dp_start_block_idx = num_cu * sk_occupancy;
+        dp_iters_per_block = k_iters_per_tile;
+
+        sk_total_iters = k_iters_per_tile * sk_tiles;
+
+        // printf("num_tiles:%d, full_dispatches:%d, full_dispatch_tiles:%d,
+        // partial_dispatch_tiles:%d\n",
+        //         num_tiles, full_dispatches, full_dispatch_tiles, partial_dispatch_tiles);
+
+        {
+            uint32_t min_sk_tiles = (sk_tiles >= num_cu) ? num_cu : (sk_tiles + 1);
+            uint32_t max_sk_tiles =
+                (sk_tiles >= num_cu) ? num_cu * sk_occupancy
+                                     : impl::min(num_cu, sk_total_iters / min_k_iters_per_sk_block);
+
+            // if use dp for sk-block, how many iters do we need
+            uint32_t dp_for_sk_iters = k_iters_per_tile;
+
+            // we need to find the smallest sk iters; start from the largest value
+            // the score type can hold so "no candidate tried" always loses to dp
+            uint32_t best_sk_score = std::numeric_limits<uint32_t>::max();
+            sk_num_blocks          = 0;
+            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
+                tentative_sk_blocks++)
+            {
+                uint32_t tentative_sk_iters_per_block =
+                    (sk_total_iters + tentative_sk_blocks - 1) / tentative_sk_blocks;
+                uint32_t tentative_sk_iters = tentative_sk_iters_per_block;
+                uint32_t sk_blocks_per_tile = (tentative_sk_blocks + sk_tiles - 1) / sk_tiles;
+
+                // TODO: carefully adjust this parameter
+                //       the more sk_blocks_per_tile, the worse the overhead
+                uint32_t cross_sk_blocks_overhead = sk_blocks_per_tile;
+                if(tentative_sk_blocks % sk_tiles != 0)
+                {
+                    // penalty for uneven divide
+                    cross_sk_blocks_overhead +=
+                        sk_blocks_per_tile * tentative_sk_iters_per_block / 50;
+                }
+
+                uint32_t tentative_sk_score = tentative_sk_iters + cross_sk_blocks_overhead;
+
+                if(tentative_sk_score < best_sk_score)
+                {
+                    best_sk_score = tentative_sk_score;
+                    sk_num_blocks = tentative_sk_blocks;
+                }
+            }
+
+            // sk only pays off when its best score beats running the tiles as dp
+            if(best_sk_score >= dp_for_sk_iters)
+            {
+                sk_num_blocks = 0;
+            }
+
+            if(sk_num_blocks == 0)
+            {
+                sk_num_big_blocks     = 0;
+                k_iters_per_big_block = 0;
+
+                dp_num_blocks      = num_tiles; // all tile to be dp block
+                dp_start_block_idx = 0;
+                sk_total_iters     = 0; // clear this tiles
+            }
+            else
+            {
+                // the remainder of the division decides how many blocks take one
+                // extra iteration ("big" blocks)
+                uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks;
+                sk_num_big_blocks     = sk_total_iters - k_iters_per_sk_block * sk_num_blocks;
+                k_iters_per_big_block = k_iters_per_sk_block + 1;
+
+                dp_num_blocks = dp_tiles;
+                // dp blocks start at the next multiple of num_cu after the sk blocks
+                dp_start_block_idx = (sk_num_blocks + num_cu - 1) / num_cu * num_cu;
+            }
+        }
+    }
+};
+
+// Plain record of five unsigned counters describing a unit of work in a tile.
+// NOTE(review): nothing visible in this file reads or writes this struct, so the
+// field notes below are inferred from the names only; confirm before relying on
+// them.
+struct tile_work_t
+{
+    uint32_t tile_idx;          // presumably the index of the output tile
+    uint32_t iter_begin;        // presumably the first global iteration of the work item
+    uint32_t k_begin;           // presumably the first k-iteration inside the tile
+    uint32_t k_end;             // presumably one past the last k-iteration inside the tile
+    uint32_t k_iters_remaining; // presumably the iterations still left to process
+};
+
+// Host-side dry run of the dispatch computed by block_dispatcher_t.
+//
+// Walks every block index of the simulated grid, prints which tile and which
+// iteration range the block would process, and records every global iteration
+// it covers. Finally it reports how many iterations were never covered.
+//
+// Returns 0 when every iteration was covered. Returns -1 on arguments that
+// would cause a division by zero, on an out-of-range iteration, or when at
+// least one iteration was left untouched.
+int main(int argc, char** argv)
+{
+    simple_args_t arg = create_arg(argc, argv);
+
+    const uint32_t m_per_block = arg.get_uint32("m_per_block");
+    const uint32_t n_per_block = arg.get_uint32("n_per_block");
+    const uint32_t k_per_block = arg.get_uint32("k_per_block");
+    const uint32_t num_cu      = arg.get_uint32("num_cu");
+    const uint32_t occupancy   = arg.get_uint32("occupancy");
+    const uint32_t m           = arg.get_uint32("m");
+    const uint32_t n           = arg.get_uint32("n");
+    const uint32_t k           = arg.get_uint32("k");
+
+    // every one of these ends up as a divisor, either in the dispatcher or below
+    if(m_per_block == 0 || n_per_block == 0 || k_per_block == 0 || num_cu == 0 ||
+       occupancy == 0 || k == 0)
+    {
+        printf("m_per_block, n_per_block, k_per_block, num_cu, occupancy and k must be non-zero\n");
+        return -1;
+    }
+
+    block_dispatcher_t block_dispatcher{
+        m_per_block, n_per_block, k_per_block, num_cu, occupancy, m, n, k};
+    block_dispatcher.dump();
+
+    // simulate actual kernel launch
+    const uint32_t dim_x         = block_dispatcher.get_grid_dims_x();
+    const uint32_t total_k_iters = impl::integer_divide_ceil(k, k_per_block);
+    const uint32_t num_tiles =
+        impl::integer_divide_ceil(m, m_per_block) * impl::integer_divide_ceil(n, n_per_block);
+    const uint32_t k_iters_per_tile = block_dispatcher.k_iters_per_tile;
+
+    // one flag per (tile, k-iteration); widen before multiplying to avoid 32-bit wrap
+    std::vector<int> valid_tile_record(static_cast<size_t>(num_tiles) * total_k_iters);
+
+    for(uint32_t bid = 0; bid < dim_x; bid++)
+    {
+        const uint32_t block_idx = block_dispatcher.get_block_idx(bid);
+        const bool is_sk_block   = block_idx < (block_dispatcher.sk_num_blocks);
+        const bool is_dp_block   = block_idx >= block_dispatcher.dp_start_block_idx;
+
+        if(!is_sk_block && !is_dp_block)
+        {
+            // padding block between the sk and dp regions: it owns no iterations
+            printf("[other   ] bid:%3u, block_idx:%3u\n", bid, block_idx);
+            continue;
+        }
+
+        uint32_t iter_start = 0;
+        uint32_t iter_end   = 0;
+        block_dispatcher.get_block_itr(block_idx, iter_start, iter_end);
+
+        const char* block_kind = is_sk_block ? "sk_block" : "dp_block";
+
+        // consume the block's range from the back, one tile-sized piece at a time
+        while(iter_end > iter_start)
+        {
+            const uint32_t remaining       = iter_end - iter_start;
+            const uint32_t iter_length_mod = iter_end % k_iters_per_tile;
+            // a piece never crosses a tile boundary: it is at most the part of the
+            // current tile that lies below iter_end (a whole tile when iter_end is
+            // tile-aligned) and never more than what the block still owns
+            const uint32_t current_iter_length =
+                impl::min(iter_length_mod == 0 ? k_iters_per_tile : iter_length_mod, remaining);
+            const uint32_t tile_idx = (iter_end - 1) / k_iters_per_tile;
+            const uint32_t tile_iter_start =
+                ((iter_end - 1) % k_iters_per_tile) - current_iter_length + 1;
+
+            printf("[%s] bid:%3u, block_idx:%3u, tile_idx:%3u, iter_start:%u(%u | %u), "
+                   "iter_end:%u (len:%u)\n",
+                   block_kind,
+                   bid,
+                   block_idx,
+                   tile_idx,
+                   iter_end - current_iter_length,
+                   tile_iter_start,
+                   iter_start,
+                   iter_end,
+                   current_iter_length);
+
+            // some validation check
+            for(uint32_t i = iter_end - current_iter_length; i < iter_end; i++)
+            {
+                if(i >= valid_tile_record.size())
+                {
+                    printf("unexpected, current iter:%u larger than max:%zu\n",
+                           i,
+                           valid_tile_record.size());
+                    return -1;
+                }
+                valid_tile_record[i] = 1;
+            }
+
+            iter_end -= current_iter_length;
+        }
+    }
+
+    size_t untouched = 0;
+    for(size_t i = 0; i < valid_tile_record.size(); i++)
+    {
+        if(valid_tile_record[i] != 1)
+        {
+            printf("untouched at %zu (%zu)\n", i, valid_tile_record.size());
+            untouched++;
+        }
+    }
+    printf("untouched %zu/%zu, %s\n",
+           untouched,
+           valid_tile_record.size(),
+           untouched == 0 ? "valid" : "fail");
+
+    // let scripts detect a failed run through the exit status
+    return untouched == 0 ? 0 : -1;
+}
--- a/test/block_swizzle_test/rebuild.sh
+++ b/test/block_swizzle_test/rebuild.sh
+#!/bin/sh
+# Builds the standalone block_swizzle_test host executable.
+# The C++ compiler defaults to g++ and can be overridden through CXX,
+# e.g.  CXX=clang++ ./rebuild.sh
+set -eu
+
+# the source file is referenced relative to this script, so build from its directory
+cd "$(dirname "$0")"
+
+CXX="${CXX:-g++}"
+
+# $CXX is left unquoted on purpose so it may carry a launcher prefix such as "ccache g++"
+$CXX -Wall -std=c++17 -Iinclude -O3 block_swizzle_test.cpp -o block_swizzle_test.exe
\ No newline at end of file
--- a/test/block_swizzle_test/simple_args.h
+++ b/test/block_swizzle_test/simple_args.h
+#pragma once
+
+#include <iomanip>
+#include <iostream>
+#include <stdlib.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <assert.h>
+
+// One registered command-line option.
+struct arg_content_t
+{
+    std::string name; // key
+    std::string value;
+    std::string help_text;
+};
+
+// Minimal "-name=value" command-line parser.
+//
+// Options are registered with insert() (name, default value, help text), then
+// parse() overwrites the defaults with whatever appears in argv. The typed
+// get_*() accessors convert the stored string on demand.
+class simple_args_t
+{
+    public:
+    simple_args_t() {}
+
+    // Registers an option. A name that is already registered is reported on
+    // stdout and the existing entry is kept. Returns *this so calls can be chained.
+    simple_args_t& insert(const std::string& name_,
+                          const std::string& default_value_,
+                          const std::string& help_text_)
+    {
+        if(arg_map.find(name_) != arg_map.end())
+        {
+            std::cout << "arg:" << name_ << "already exist" << std::endl;
+            return *this;
+        }
+
+        arg_map[name_] = arg_content_t{name_, default_value_, help_text_};
+        return *this;
+    }
+
+    // Prints every registered option with its help text to stdout. Help texts
+    // containing '\n' are printed as several aligned lines.
+    void usage()
+    {
+        for(auto& entry : arg_map)
+        {
+            const std::string& option_name = entry.second.name;
+            const std::string& text        = entry.second.help_text;
+
+            // split the help text at every newline; there is always at least one piece
+            std::vector<std::string> pieces;
+            size_t piece_begin = 0;
+            size_t newline_at  = text.find('\n', piece_begin);
+            while(newline_at != std::string::npos)
+            {
+                pieces.push_back(text.substr(piece_begin, newline_at - piece_begin));
+                piece_begin = newline_at + 1;
+                newline_at  = text.find('\n', piece_begin);
+            }
+            pieces.push_back(text.substr(piece_begin));
+
+            // pad short names out to a 16 character column, long names get 2
+            int padding = 16 - option_name.length();
+            if(padding <= 0)
+            {
+                padding = 2;
+            }
+
+            std::cout << std::setw(4) << "-" << option_name << std::setw(padding) << " "
+                      << pieces[0] << std::endl;
+
+            for(size_t line_no = 1; line_no < pieces.size(); ++line_no)
+            {
+                std::cout << std::setw(28) << " " << pieces[line_no] << std::endl;
+            }
+        }
+    }
+
+    // Applies argv[start_index .. argc) to the registered options.
+    // Returns true when every argument was accepted (or there were none).
+    // Returns false after printing a message when an argument does not start
+    // with '-', is the help request "-?", is not of the form "-name=value", or
+    // names an option that was never registered. Arguments accepted before the
+    // failing one stay applied.
+    bool parse(int argc, char* argv[], int start_index = 1)
+    {
+        // when argc <= start_index the loop body never runs and parsing succeeds
+        for(int arg_no = start_index; arg_no < argc; arg_no++)
+        {
+            const std::string token(argv[arg_no]);
+
+            if(token[0] != '-')
+            {
+                std::cout << "illegal input" << std::endl;
+                usage();
+                return false;
+            }
+
+            if(token[1] == '?')
+            {
+                usage();
+                return false;
+            }
+
+            const size_t equal_at = token.find('=');
+            if(equal_at == std::string::npos || equal_at + 1 == token.length())
+            {
+                std::cout << "failed while parsing \"" << token << "\", "
+                          << "arg must be in the form \"-name=value\"" << std::endl;
+                return false;
+            }
+
+            // the name sits between the leading '-' and the '='
+            const std::string key = token.substr(1, equal_at - 1);
+            auto found            = arg_map.find(key);
+            if(found == arg_map.end())
+            {
+                std::cout << "no such arg \"" << key << "\" registered" << std::endl;
+                return false;
+            }
+            found->second.value = token.substr(equal_at + 1);
+        }
+        return true;
+    }
+
+    // Same as get_str().
+    std::string get(const std::string& name) const { return get_str(name); }
+
+    // Current value of the option as a string. The option must be registered.
+    std::string get_str(const std::string& name) const { return stored_value(name); }
+
+    // Current value converted with atoi().
+    int get_int(const std::string& name) const { return atoi(stored_value(name).c_str()); }
+
+    // Current value converted with strtoul() in base 10.
+    uint32_t get_uint32(const std::string& name) const
+    {
+        return strtoul(stored_value(name).c_str(), nullptr, 10);
+    }
+
+    // Current value converted with strtoull() in base 10.
+    uint64_t get_uint64(const std::string& name) const
+    {
+        return strtoull(stored_value(name).c_str(), nullptr, 10);
+    }
+
+    // Current value converted with atof().
+    double get_double(const std::string& name) const { return atof(stored_value(name).c_str()); }
+
+    // Current value converted with atof() and narrowed to float.
+    float get_float(const std::string& name) const
+    {
+        return static_cast<float>(atof(stored_value(name).c_str()));
+    }
+
+    private:
+    // Looks up the value string of a registered option; asserts that it exists.
+    const std::string& stored_value(const std::string& name) const
+    {
+        assert(arg_map.count(name) != 0);
+        return arg_map.at(name).value;
+    }
+
+    std::unordered_map<std::string, arg_content_t> arg_map;
+};