Commit 4227fce6 authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into gemm_activation

parents c24b6de6 4041850f
......@@ -200,3 +200,4 @@ enable_cppcheck(
add_subdirectory(host)
add_subdirectory(example)
add_subdirectory(profiler)
add_subdirectory(test)
......@@ -20,10 +20,10 @@
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
enum ConvTensorLayout
{
......
......@@ -34,14 +34,14 @@ int conv_profiler(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv=Convolution)\n");
printf("arg2: data type (0=fp32, 1=fp16)\n");
printf("arg3: input tensor layout (0=NCHW, 1=NHWC)\n");
printf("arg4: weight tensor layout (0=KCYX, 1=KYXC)\n");
printf("arg5: output tensor layout (0=NKHW, 1=NHWK)\n");
printf("arg6: verification (0=no, 1=yes)\n");
printf("arg7: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg8: print matrix value (0=no, 1=yes)\n");
printf("arg1: tensor operation (conv: Convolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
......
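For reference, a hypothetical conv profiler invocation matching the argument list above (the driver name and all sizes here are illustrative, not taken from this commit):

./profiler conv 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1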
......@@ -37,12 +37,15 @@ int gemm_profiler(int argc, char* argv[])
{
if(argc != 14)
{
printf("arg1: tensor operation (gemm=GEMM)\n");
printf("arg2: data type (0=fp32, 1=fp16)\n");
printf("arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)\n");
printf("arg4: verification (0=no, 1=yes)\n");
printf("arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg6: print matrix value (0=no, 1=yes)\n");
printf("arg1: tensor operation (gemm: GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, n] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, n] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
exit(1);
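For reference, a hypothetical invocation exercising the new negative-stride defaults (driver name and sizes are illustrative):

./profiler gemm 1 0 1 1 0 5 3840 4096 4096 -1 -1 -1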
......@@ -70,8 +73,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
......@@ -80,8 +91,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
......@@ -90,8 +109,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
......@@ -100,8 +127,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
......@@ -110,8 +145,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
......@@ -120,8 +163,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
......@@ -130,8 +181,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
......@@ -140,8 +199,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else
{
......
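A note on the stride handling added in each branch above: a negative user-supplied stride now falls back to the packed default for that layout, i.e. the number of columns of a row-major matrix or the number of rows of a column-major one (K for row-major A[M, K], M for column-major A, and so on). A minimal sketch of the rule, using hypothetical helper names that are not part of this commit:

// hypothetical helpers illustrating the packed-stride fallback above
inline int packed_stride(bool is_row_major, int rows, int cols)
{
    // leading dimension of a densely packed matrix
    return is_row_major ? cols : rows;
}

inline int resolve_stride(int user_stride, bool is_row_major, int rows, int cols)
{
    // negative means "use the packed default"
    return (user_stride < 0) ? packed_stride(is_row_major, rows, cols) : user_stride;
}

For example, resolve_stride(-1, /*is_row_major=*/true, M, K) yields K, matching the (StrideA < 0) ? K : StrideA expression in the first branch.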
......@@ -18,7 +18,28 @@ REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 256 256 256 256 256 256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
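# a negative stride (-1) makes the profiler fall back to the packed
# default stride for the chosen layout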
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1
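# square sizes with packed strides given explicitly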
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192
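# leading dimensions padded past the matrix size (+32 here, +64 below)
# to cover non-packed strides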
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
)
set(MAGIC_NUMBER_DIVISION_SOURCE magic_number_division/main.cpp)
add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISION_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
__global__ void gpu_magic_number_division(uint32_t magic_multiplier,
uint32_t magic_shift,
const int32_t* p_dividend,
int32_t* p_result,
uint64_t num)
{
uint64_t global_thread_num = blockDim.x * gridDim.x;
uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
{
p_result[data_id] =
ck::MagicDivision::DoMagicDivision(p_dividend[data_id], magic_multiplier, magic_shift);
}
}
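// ---------------------------------------------------------------------------
// For context: magic-number division replaces "n / d" for a runtime-constant
// divisor d with a multiply, an add and two shifts. Below is a sketch of one
// well-known 32-bit scheme, for illustration only (ck::MagicDivision's exact
// algorithm may differ). It picks
//     s = ceil(log2(d)),   m = floor(2^32 * (2^s - d) / d) + 1,
// and then, for every dividend 0 <= n < 2^31,
//     floor(n / d) == ((uint64_t(n) * m >> 32) + n) >> s.
#include <cstdint>
#include <cassert>

struct MagicDivSketch
{
    uint32_t multiplier;
    uint32_t shift;

    explicit MagicDivSketch(uint32_t d)
    {
        assert(d >= 1 && d < (1u << 31));

        // s = ceil(log2(d))
        shift = 0;
        while((uint64_t{1} << shift) < d)
            ++shift;

        // m = floor(2^32 * (2^s - d) / d) + 1; the result fits in 32 bits
        // because 2^s - d < d whenever s is chosen minimally
        multiplier = static_cast<uint32_t>(
            ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - d)) / d + 1);
    }

    uint32_t divide(uint32_t n) const
    {
        // high 32 bits of the 64-bit product, then the add-and-shift fixup
        uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(n) * multiplier) >> 32);
        return (hi + n) >> shift;
    }
};
// ---------------------------------------------------------------------------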
__global__ void
gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result, uint64_t num)
{
uint64_t global_thread_num = blockDim.x * gridDim.x;
uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
{
p_result[data_id] = p_dividend[data_id] / divisor;
}
}
// return the largest absolute element-wise difference between ref and result
template <typename T>
T check_error(const std::vector<T>& ref, const std::vector<T>& result)
{
    T max_diff = 0;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        T diff = std::abs(ref[i] - result[i]);

        if(max_diff < diff)
        {
            max_diff = diff;
        }
    }

    return max_diff;
}
int main(int, char*[])
{
uint64_t num_divisor = 4096;
uint64_t num_dividend = 1L << 16;
std::vector<int32_t> divisors_host(num_divisor);
std::vector<int32_t> dividends_host(num_dividend);
// generate divisor
for(uint64_t i = 0; i < num_divisor; ++i)
{
divisors_host[i] = i + 1;
}
// generate dividend
for(uint64_t i = 0; i < num_dividend; ++i)
{
dividends_host[i] = i;
}
DeviceMem dividends_dev_buf(sizeof(int32_t) * num_dividend);
DeviceMem naive_result_dev_buf(sizeof(int32_t) * num_dividend);
DeviceMem magic_result_dev_buf(sizeof(int32_t) * num_dividend);
std::vector<int32_t> naive_result_host(num_dividend);
std::vector<int32_t> magic_result_host(num_dividend);
dividends_dev_buf.ToDevice(dividends_host.data());
bool pass = true;
for(std::size_t i = 0; i < num_divisor; ++i)
{
// run naive division on GPU
gpu_naive_division<<<1024, 256>>>(
divisors_host[i],
static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
static_cast<int32_t*>(naive_result_dev_buf.GetDeviceBuffer()),
num_dividend);
// calculate magic number
uint32_t magic_multiplier, magic_shift;
ck::tie(magic_multiplier, magic_shift) =
ck::MagicDivision::CalculateMagicNumbers(divisors_host[i]);
// run magic division on GPU
gpu_magic_number_division<<<1024, 256>>>(
magic_multiplier,
magic_shift,
static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
static_cast<int32_t*>(magic_result_dev_buf.GetDeviceBuffer()),
num_dividend);
naive_result_dev_buf.FromDevice(naive_result_host.data());
magic_result_dev_buf.FromDevice(magic_result_host.data());
int32_t max_diff = check_error(naive_result_host, magic_result_host);
if(max_diff != 0)
{
pass = false;
}
}
if(pass)
{
std::cout << "test magic number division: Pass" << std::endl;
}
else
{
std::cout << "test magic number division: Fail" << std::endl;
}
return pass ? 0 : 1;
}