Commit 4227fce6 authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into gemm_activation

parents c24b6de6 4041850f
......@@ -200,3 +200,4 @@ enable_cppcheck(
add_subdirectory(host)
add_subdirectory(example)
add_subdirectory(profiler)
add_subdirectory(test)
......@@ -20,10 +20,10 @@
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
enum ConvTensorLayout
{
......
......@@ -34,14 +34,14 @@ int conv_profiler(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv=Convolution)\n");
printf("arg2: data type (0=fp32, 1=fp16)\n");
printf("arg3: input tensor layout (0=NCHW, 1=NHWC)\n");
printf("arg4: weight tensor layout (0=KCYX, 1=KYXC)\n");
printf("arg5: output tensor layout (0=NKHW, 1=NHWK)\n");
printf("arg6: verification (0=no, 1=yes)\n");
printf("arg7: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg8: print matrix value (0=no, 1=yes)\n");
printf("arg1: tensor operation (conv: Convolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
......
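For reference, a hypothetical conv profiler invocation matching the argument list above (the driver name and all sizes here are illustrative, not taken from this commit):

./profiler conv 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1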
......@@ -37,12 +37,15 @@ int gemm_profiler(int argc, char* argv[])
{
if(argc != 14)
{
printf("arg1: tensor operation (gemm=GEMM)\n");
printf("arg2: data type (0=fp32, 1=fp16)\n");
printf("arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)\n");
printf("arg4: verification (0=no, 1=yes)\n");
printf("arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg6: print matrix value (0=no, 1=yes)\n");
printf("arg1: tensor operation (gemm: GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, n] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, n] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
exit(1);
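For reference, a hypothetical invocation exercising the new negative-stride defaults (driver name and sizes are illustrative):

./profiler gemm 1 0 1 1 0 5 3840 4096 4096 -1 -1 -1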
......@@ -70,8 +73,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
......@@ -80,8 +91,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
......@@ -90,8 +109,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
......@@ -100,8 +127,16 @@ int gemm_profiler(int argc, char* argv[])
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
......@@ -110,8 +145,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
......@@ -120,8 +163,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
......@@ -130,8 +181,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
......@@ -140,8 +199,16 @@ int gemm_profiler(int argc, char* argv[])
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else
{
......
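A note on the stride handling added in each branch above: a negative user-supplied stride now falls back to the packed default for that layout, i.e. the number of columns of a row-major matrix or the number of rows of a column-major one (K for row-major A[M, K], M for column-major A, and so on). A minimal sketch of the rule, using hypothetical helper names that are not part of this commit:

// hypothetical helpers illustrating the packed-stride fallback above
inline int packed_stride(bool is_row_major, int rows, int cols)
{
    // leading dimension of a densely packed matrix
    return is_row_major ? cols : rows;
}

inline int resolve_stride(int user_stride, bool is_row_major, int rows, int cols)
{
    // negative means "use the packed default"
    return (user_stride < 0) ? packed_stride(is_row_major, rows, cols) : user_stride;
}

For example, resolve_stride(-1, /*is_row_major=*/true, M, K) yields K, matching the (StrideA < 0) ? K : StrideA expression in the first branch.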
......@@ -18,7 +18,28 @@ REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 256 256 256 256 256 256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
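# a negative stride (-1) makes the profiler fall back to the packed
# default stride for the chosen layout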
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1
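# square sizes with packed strides given explicitly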
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192
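# leading dimensions padded past the matrix size (+32 here, +64 below)
# to cover non-packed strides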
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
)
set(MAGIC_NUMBER_DIVISION_SOURCE magic_number_division/main.cpp)
add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISION_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
__global__ void gpu_magic_number_division(uint32_t magic_multiplier,
uint32_t magic_shift,
const int32_t* p_dividend,
int32_t* p_result,
uint64_t num)
{
uint64_t global_thread_num = blockDim.x * gridDim.x;
uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
{
p_result[data_id] =
ck::MagicDivision::DoMagicDivision(p_dividend[data_id], magic_multiplier, magic_shift);
}
}
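// ---------------------------------------------------------------------------
// For context: magic-number division replaces "n / d" for a runtime-constant
// divisor d with a multiply, an add and two shifts. Below is a sketch of one
// well-known 32-bit scheme, for illustration only (ck::MagicDivision's exact
// algorithm may differ). It picks
//     s = ceil(log2(d)),   m = floor(2^32 * (2^s - d) / d) + 1,
// and then, for every dividend 0 <= n < 2^31,
//     floor(n / d) == ((uint64_t(n) * m >> 32) + n) >> s.
#include <cstdint>
#include <cassert>

struct MagicDivSketch
{
    uint32_t multiplier;
    uint32_t shift;

    explicit MagicDivSketch(uint32_t d)
    {
        assert(d >= 1 && d < (1u << 31));

        // s = ceil(log2(d))
        shift = 0;
        while((uint64_t{1} << shift) < d)
            ++shift;

        // m = floor(2^32 * (2^s - d) / d) + 1; the result fits in 32 bits
        // because 2^s - d < d whenever s is chosen minimally
        multiplier = static_cast<uint32_t>(
            ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - d)) / d + 1);
    }

    uint32_t divide(uint32_t n) const
    {
        // high 32 bits of the 64-bit product, then the add-and-shift fixup
        uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(n) * multiplier) >> 32);
        return (hi + n) >> shift;
    }
};
// ---------------------------------------------------------------------------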
__global__ void
gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result, uint64_t num)
{
uint64_t global_thread_num = blockDim.x * gridDim.x;
uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
{
p_result[data_id] = p_dividend[data_id] / divisor;
}
}
// return the largest absolute element-wise difference between ref and result
template <typename T>
T check_error(const std::vector<T>& ref, const std::vector<T>& result)
{
    T max_diff = 0;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        T diff = std::abs(ref[i] - result[i]);

        if(max_diff < diff)
        {
            max_diff = diff;
        }
    }

    return max_diff;
}
int main(int, char*[])
{
uint64_t num_divisor = 4096;
uint64_t num_dividend = 1L << 16;
std::vector<int32_t> divisors_host(num_divisor);
std::vector<int32_t> dividends_host(num_dividend);
// generate divisor
for(uint64_t i = 0; i < num_divisor; ++i)
{
divisors_host[i] = i + 1;
}
// generate dividend
for(uint64_t i = 0; i < num_dividend; ++i)
{
dividends_host[i] = i;
}
DeviceMem dividends_dev_buf(sizeof(int32_t) * num_dividend);
DeviceMem naive_result_dev_buf(sizeof(int32_t) * num_dividend);
DeviceMem magic_result_dev_buf(sizeof(int32_t) * num_dividend);
std::vector<int32_t> naive_result_host(num_dividend);
std::vector<int32_t> magic_result_host(num_dividend);
dividends_dev_buf.ToDevice(dividends_host.data());
bool pass = true;
for(std::size_t i = 0; i < num_divisor; ++i)
{
// run naive division on GPU
gpu_naive_division<<<1024, 256>>>(
divisors_host[i],
static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
static_cast<int32_t*>(naive_result_dev_buf.GetDeviceBuffer()),
num_dividend);
// calculate magic number
uint32_t magic_multiplier, magic_shift;
ck::tie(magic_multiplier, magic_shift) =
ck::MagicDivision::CalculateMagicNumbers(divisors_host[i]);
// run magic division on GPU
gpu_magic_number_division<<<1024, 256>>>(
magic_multiplier,
magic_shift,
static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
static_cast<int32_t*>(magic_result_dev_buf.GetDeviceBuffer()),
num_dividend);
naive_result_dev_buf.FromDevice(naive_result_host.data());
magic_result_dev_buf.FromDevice(magic_result_host.data());
int32_t max_diff = check_error(naive_result_host, magic_result_host);
if(max_diff != 0)
{
pass = false;
}
}
if(pass)
{
std::cout << "test magic number division: Pass" << std::endl;
}
else
{
std::cout << "test magic number division: Fail" << std::endl;
}
return pass ? 0 : 1;
}