Commit e72c0c43 authored by carlushuang's avatar carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents d714fa15 313bbea5
......@@ -6,7 +6,7 @@
#include <half.hpp>
#include "profile_gemm_bias_relu_add_impl.hpp"
enum GemmMatrixLayout
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
......@@ -18,7 +18,7 @@ enum GemmMatrixLayout
KM_NK_NM, // 7
};
enum GemmDataType
enum struct GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
......@@ -43,8 +43,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
exit(1);
}
const int data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
......
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_gemm_reduce_impl.hpp"
int profile_gemm_reduce(int argc, char* argv[])
{
enum struct GemmMatrixLayout_t
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmReduceDataType_t
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F32_F32, // 1
};
if(!(argc == 14 || argc == 15))
{
printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
}
const auto data_type = static_cast<GemmReduceDataType_t>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout_t>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout_t::MK_KN_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout_t::MK_NK_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout_t::KM_KN_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout_t::KM_NK_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else
{
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_grouped_gemm_impl.hpp"
// Matrix layout selector read from the "matrix layout" command-line argument.
// The numeric values are what the user types, so they are spelled out.
// Values 0..3 are described by the usage text; 4..7 are not (presumably the
// same operand layouts with C stored as [n, m] -- TODO confirm).
enum GemmMatrixLayout
{
    MK_KN_MN = 0, // A[m, k] * B[k, n] = C[m, n]
    MK_NK_MN = 1, // A[m, k] * B[n, k] = C[m, n]
    KM_KN_MN = 2, // A[k, m] * B[k, n] = C[m, n]
    KM_NK_MN = 3, // A[k, m] * B[n, k] = C[m, n]
    MK_KN_NM = 4,
    MK_NK_NM = 5,
    KM_KN_NM = 6,
    KM_NK_NM = 7,
};
// Data type selector read from the "data type" command-line argument
// (0: fp32; 1: fp16; 2: bf16; 3: int8, as listed in the usage text).
// Each name lists three element types (presumably for A, B and C).
enum GemmDataType
{
    F32_F32_F32    = 0,
    F16_F16_F16    = 1,
    BF16_BF16_BF16 = 2,
    INT8_INT8_INT8 = 3,
};
// Splits a comma-separated command-line argument such as "256,128,64" into
// its integer fields.
//
// Every field is converted with std::stoi, so an empty field (e.g. "1,,2")
// or a non-numeric field makes std::stoi throw. An empty string yields an
// empty vector, and a single trailing comma does not add a field.
std::vector<int> argToIntArray(char* input)
{
    const std::string text(input);
    std::vector<int> values;

    std::size_t field_begin = 0;
    while(field_begin < text.size())
    {
        const std::size_t comma = text.find(',', field_begin);
        const std::size_t field_end = (comma == std::string::npos) ? text.size() : comma;

        values.push_back(std::stoi(text.substr(field_begin, field_end - field_begin)));

        // Step over the comma; past-the-end when this was the last field.
        field_begin = field_end + 1;
    }

    return values;
}
int profile_grouped_gemm(int argc, char* argv[])
{
if(!(argc == 14))
{
printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n");
exit(1);
}
const int data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
const auto Ks = argToIntArray(argv[10]);
const auto StrideAs = argToIntArray(argv[11]);
const auto StrideBs = argToIntArray(argv[12]);
const auto StrideCs = argToIntArray(argv[13]);
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
}
......@@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"scales", required_argument, nullptr, 'S'},
{"half", no_argument, nullptr, '?'},
{"double", no_argument, nullptr, '?'},
{"int8", no_argument, nullptr, '?'},
{"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
......@@ -82,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
return (values);
}
typedef enum
enum struct appDataType_t
{
appHalf = 0,
appFloat = 1,
......@@ -91,7 +93,7 @@ typedef enum
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
} appDataType_t;
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{
......@@ -119,6 +121,8 @@ class AppArgs
public:
bool use_half = false;
bool use_double = false;
bool use_int8 = false;
bool use_bf16 = false;
std::vector<size_t> inLengths;
std::vector<size_t> outLengths;
......@@ -127,8 +131,8 @@ class AppArgs
std::vector<float> scales;
ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
appDataType_t compTypeId = appFloat;
appDataType_t outTypeId = appFloat;
appDataType_t compTypeId = appDataType_t::appFloat;
appDataType_t outTypeId = appDataType_t::appFloat;
bool compType_assigned = false;
bool outType_assigned = false;
......@@ -169,6 +173,8 @@ class AppArgs
<< std::endl;
std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl;
std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
......@@ -267,6 +273,10 @@ class AppArgs
use_half = true;
else if(std::string(long_options[option_index].name) == "double")
use_double = true;
else if(std::string(long_options[option_index].name) == "int8")
use_int8 = true;
else if(std::string(long_options[option_index].name) == "bf16")
use_bf16 = true;
else if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
......@@ -329,15 +339,16 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half)
{
if(!args.compType_assigned)
args.compTypeId = appHalf;
args.compTypeId = appDataType_t::appHalf;
if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
args.outTypeId = appFloat;
if(args.outType_assigned &&
(args.outTypeId != appDataType_t::appHalf && args.outTypeId != appDataType_t::appFloat))
args.outTypeId = appDataType_t::appFloat;
if(!args.outType_assigned)
args.outTypeId = appHalf;
args.outTypeId = appDataType_t::appHalf;
if(args.compTypeId == appHalf)
if(args.compTypeId == appDataType_t::appHalf)
{
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
args.init_method,
......@@ -352,7 +363,7 @@ int profile_reduce(int argc, char* argv[])
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == appFloat)
else if(args.compTypeId == appDataType_t::appFloat)
{
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method,
......@@ -385,9 +396,76 @@ int profile_reduce(int argc, char* argv[])
args.scales[0],
args.scales[1]);
}
else if(args.use_int8)
{
if(!args.compType_assigned)
args.compTypeId = appDataType_t::appInt8;
if(args.outType_assigned &&
(args.outTypeId != appDataType_t::appInt8 && args.outTypeId != appDataType_t::appInt32))
args.outTypeId = appDataType_t::appInt32;
if(!args.outType_assigned)
args.outTypeId = appDataType_t::appInt8;
if(args.compTypeId == appDataType_t::appInt8)
{
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == appDataType_t::appInt32)
{
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else
throw std::runtime_error("Invalid compType assignment!");
}
else if(args.use_bf16)
{
if(args.outType_assigned && (args.outTypeId != appDataType_t::appBFloat16 &&
args.outTypeId != appDataType_t::appFloat))
args.outTypeId = appDataType_t::appFloat;
if(!args.outType_assigned)
args.outTypeId = appDataType_t::appBFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else
{
if(args.compTypeId == appFloat)
if(args.compTypeId == appDataType_t::appFloat)
{
profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method,
......@@ -402,7 +480,7 @@ int profile_reduce(int argc, char* argv[])
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == appDouble)
else if(args.compTypeId == appDataType_t::appDouble)
{
profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method,
......
......@@ -5,10 +5,12 @@
#include <cstring>
int profile_gemm(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_gemm_bias_2d(int, char*[]);
int profile_gemm_bias_relu(int, char*[]);
int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
......@@ -34,10 +36,18 @@ int main(int argc, char* argv[])
{
return profile_gemm_bias_relu_add(argc, argv);
}
else if(strcmp(argv[1], "gemm_reduce") == 0)
{
return profile_gemm_reduce(argc, argv);
}
else if(strcmp(argv[1], "batched_gemm") == 0)
{
return profile_batched_gemm(argc, argv);
}
else if(strcmp(argv[1], "grouped_gemm") == 0)
{
profile_grouped_gemm(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd") == 0)
{
return profile_conv_fwd(argc, argv);
......@@ -69,12 +79,14 @@ int main(int argc, char* argv[])
" gemm_bias_2d: GEMM+Bias(2D)\n"
" gemm_bias_relu: GEMM+Bias+ReLU\n"
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
" gemm_reduce: GEMM+Reduce\n"
" grouped_gemm: Grouped Gemm\n"
" conv_fwd: ForwardConvolution\n"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
" conv_bwd: BackwardConvolution\n"
" reduce: REDUCE\n");
" reduce: Reduce\n");
// clang-format on
return 0;
......
......@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=../../..
MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
......@@ -3,13 +3,16 @@
PRECISION=
##PRECISION=--half
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--bf16
if test -n $PRECISION && test "$PRECISION" = "--half"; then
if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
ACCTYPE="-C 1"
else
ACCTYPE=""
elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
ACCTYPE="-C 2"
fi
driver="./bin/ckProfiler"
VERIFY="-v $1"
......@@ -20,10 +23,16 @@ NREPEAT=$3
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations="0 5 7"
#### 0 - ADD, 5 - AVG, for int8, no NORM2 supported
if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
Operations=5
fi
## for generic validation
for op in $Operations; do
set -x
####### datatype layout reduce dims op acctype verify init repeats
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
......
......@@ -3,6 +3,8 @@
PRECISION=
##PRECISION=--half
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--bf16
driver="./bin/ckProfiler"
......@@ -18,6 +20,7 @@ for op in $Operations; do
for use_idx in 0 1; do
set -x
####### datatype layout reduce dims op use index verify init repeats
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
......
#!/usr/bin/env bash
# Runs the test_conv1d_fwd / test_conv2d_fwd / test_conv3d_fwd binaries from
# ../build/bin and tees their combined stdout+stderr into per-dimension log
# files under ../log/<DATE>_<GIT_HASH>/.
#
# Options:
#   -d1 | --d1            enable the 1D run
#   -d2 | --d2            enable the 2D run (already enabled by default below)
#   -d3 | --d3            enable the 3D run
#   -all | --all          enable all three runs
#   -s | --suffix VALUE   append _VALUE to the log file suffix
# Any other argument is collected and restored as a positional parameter; the
# rest of this script does not read them.

# Error-exit is deliberately left disabled.
# set -e
# Which dimensionalities to run; only 2D is on by default.
DIM1=False
DIM2=True
DIM3=False
# Hard-coded identifiers used to name the log directory and log files.
DATE=220317
GIT_HASH=4e6dfda
LOG_DIR=${DATE}_${GIT_HASH}
SUFFIX=${GIT_HASH}
#--------------------------------------------------------------------------
# Commandline arguments parsing
# like: cmd -key[--key] value
#--------------------------------------------------------------------------
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-d1|--d1)
DIM1=True
echo DIM1: "${DIM1}"
shift # past argument
;;
-d2|--d2)
DIM2=True
echo DIM2: "${DIM2}"
shift # past argument
;;
-d3|--d3)
DIM3=True
echo DIM3: "${DIM3}"
shift # past argument
;;
-all|--all)
DIM1=True
DIM2=True
DIM3=True
echo DIM1: "${DIM1}"
echo DIM2: "${DIM2}"
echo DIM3: "${DIM3}"
shift # past argument
;;
-s|--suffix)
# The value is appended to the existing suffix, not substituted for it.
SUFFIX=${SUFFIX}_"$2"
echo SUFFIX: "${SUFFIX}"
shift # past argument
shift # past value
;;
*) # unknown option
POSITIONAL+=("$1") # save it in an array for later
shift # past argument
;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
#--------------------------------------------------------------------------
# Optional command prefix; left empty, so the tests run without numactl.
# NUMACTL="numactl --cpunodebind=1 --membind=1"
NUMACTL=
# ENV_CONF=
# GPU is only used as part of the log file names.
GPU=mi100
# NOTE(review): PROF_ITER_COUNT is assigned but not referenced in this script.
PROF_ITER_COUNT=10000
LOG_DIR_PATH=../log/${LOG_DIR}
# Trace every command from here on.
set -x
#-------------------------------------------------------------------------------
# 1D
#-------------------------------------------------------------------------------
if [[ "${DIM1}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<"
CMD="./../build/bin/test_conv1d_fwd"
# ${NUMACTL} and ${CMD} are intentionally unquoted so that an empty prefix
# disappears and a non-empty one is split into words.
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log
fi
#-------------------------------------------------------------------------------
# 2D
#-------------------------------------------------------------------------------
if [[ "${DIM2}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<"
CMD="./../build/bin/test_conv2d_fwd"
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log
fi
#-------------------------------------------------------------------------------
# 3D
#-------------------------------------------------------------------------------
if [[ "${DIM3}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<"
CMD="./../build/bin/test_conv3d_fwd"
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log
fi
#!/bin/bash
## The following will be used for CI
##
## Runs bin/test_reduce_no_index on one fixed 4-D shape (-D 64,4,280,82) for
## nine choices of reduced dimensions (-R) per data type.
## The first trailing positional argument changes with the data-type section
## (0: float, 1: float16, 3: int8_t, 5: bfloat16).
## The second trailing argument is always 2; its meaning is not visible here
## (presumably the initialization method -- TODO confirm against the test's
## argument parser).
## The script does not check exit statuses itself.
set -x
## for float
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2
set +x
#!/bin/bash
## The following will be used for CI
##
## Runs bin/test_reduce_with_index on one fixed 4-D shape (-D 64,4,280,82) for
## nine choices of reduced dimensions (-R) per data type.
## The first trailing positional argument changes with the data-type section
## (0: float, 1: float16, 3: int8_t, 5: bfloat16).
## The second trailing argument is always 2; its meaning is not visible here
## (presumably the initialization method -- TODO confirm against the test's
## argument parser).
## The script does not check exit statuses itself.
set -x
## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
set +x
......@@ -17,6 +17,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/test/include
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/external/include/half
)
......@@ -37,7 +38,10 @@ add_subdirectory(conv_util)
add_subdirectory(reference_conv_fwd)
add_subdirectory(gemm)
add_subdirectory(gemm_split_k)
add_subdirectory(conv2d_fwd)
add_subdirectory(gemm_reduce)
add_subdirectory(batched_gemm)
add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(cpu_ukernel)
# fp16 batched GEMM test: built from batched_gemm_fp16.cpp and linked against
# the host tensor helpers and the batched GEMM device instances.
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16
    PRIVATE
        host_tensor
        device_batched_gemm_instance)
#include <half.hpp>
#include <tuple>
#include <vector>
#include "batched_gemm_util.hpp"
#include "reference_batched_gemm.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "test_util.hpp"
// Element-wise operation applied to A, B and C in this test.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Pointer type of a device GEMM instance whose three element-wise operations
// are all PassThrough.
using DeviceBatchedGemmPtr =
    ck::tensor_operation::device::DeviceGemmPtr<PassThrough, PassThrough, PassThrough>;
namespace ck::tensor_operation::device::device_batched_gemm_instance {

// Forward declaration only; the definition is expected to come from the
// device_batched_gemm_instance library this test links against.
// Appends device batched GEMM instances to `instances`.
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
    std::vector<DeviceBatchedGemmPtr>& instances);

} // namespace ck::tensor_operation::device::device_batched_gemm_instance
namespace {
// Element types of the A, B and C tensors used by this test.
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
// NOTE(review): AccDataType is not referenced by the code visible in this file.
using AccDataType = float;
// Layout tags of A, B and C; they select the strides built in PrepareGemmTensor.
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
// Creates the four host tensors of one batched GEMM problem and fills the two
// inputs with generated values.
//
// Every tensor has lengths {batch_count, rows, cols}. For a RowMajor layout
// tag the strides are {rows * stride, stride, 1}; for any other tag they are
// {cols * stride, 1, stride}.
//
// Returns a tuple (A, B, C host result, C device result); the two C tensors
// are only allocated here, not filled.
auto PrepareGemmTensor(const std::size_t batch_count,
                       const ck::batched_gemm_util::GemmParams& params)
{
    using Dims = std::vector<std::size_t>;

    const auto make_descriptor = [batch_count](std::size_t rows,
                                               std::size_t cols,
                                               std::size_t leading_dim,
                                               auto layout_tag) {
        const bool is_row_major =
            std::is_same<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>::value;

        const Dims lengths{batch_count, rows, cols};
        const Dims strides = is_row_major ? Dims{rows * leading_dim, leading_dim, 1}
                                          : Dims{cols * leading_dim, 1, leading_dim};

        return HostTensorDescriptor(lengths, strides);
    };

    Tensor<ADataType> a_tensor(make_descriptor(params.M, params.K, params.StrideA, ALayout{}));
    Tensor<BDataType> b_tensor(make_descriptor(params.K, params.N, params.StrideB, BLayout{}));
    Tensor<CDataType> c_host_tensor(
        make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
    Tensor<CDataType> c_device_tensor(
        make_descriptor(params.M, params.N, params.StrideC, CLayout{}));

    // A is generated before B, as in the original sequence of calls.
    a_tensor.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
    b_tensor.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});

    return std::make_tuple(a_tensor, b_tensor, c_host_tensor, c_device_tensor);
}
bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr)
{
// Arrange
ck::batched_gemm_util::GemmParams params;
params.M = 1024;
params.N = 1024;
params.K = 1024;
params.StrideA = 1024;
params.StrideB = 1024;
params.StrideC = 1024;
auto host_tensors = PrepareGemmTensor(batch_count, params);
const Tensor<ADataType>& a = std::get<0>(host_tensors);
const Tensor<BDataType>& b = std::get<1>(host_tensors);
Tensor<CDataType>& c_host = std::get<2>(host_tensors);
Tensor<CDataType>& c_device = std::get<3>(host_tensors);
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto c_element_op = PassThrough{};
using ReferenceBatchedGemmInstance =
ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>;
ck::batched_gemm_util::RunHostBatchedGemm<ReferenceBatchedGemmInstance>(
a, b, c_host, a_element_op, b_element_op, c_element_op);
// Act
ck::batched_gemm_util::RunDeviceBatchedGemm(
gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
// Assert
// bool pass = test::check_err(
// c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
bool pass = check_error(c_device, c_host) < 0.007815f;
std::cout << (pass ? "SUCCESS" : "FAILURE") << std::endl;
return pass;
}
} // namespace
int main()
{
std::vector<DeviceBatchedGemmPtr> batched_gemm_ptrs;
ck::tensor_operation::device::device_batched_gemm_instance::
add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs);
bool pass = true;
const std::size_t batch_count = 4;
for(auto& gemmPtr : batched_gemm_ptrs)
{
pass &= TestBatchedGemm(batch_count, gemmPtr);
}
std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
return pass ? 0 : 1;
}
#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
namespace ck {
namespace batched_gemm_util {
// Sizes, strides and scaling factors of one GEMM problem.
// A default-constructed object describes a 1024x1024x1024 problem with all
// strides set to 1024, alpha = 1 and beta = 0.
struct GemmParams
{
    // Problem sizes.
    ck::index_t M = 1024;
    ck::index_t N = 1024;
    ck::index_t K = 1024;

    // Strides of A, B and C.
    ck::index_t StrideA = 1024;
    ck::index_t StrideB = 1024;
    ck::index_t StrideC = 1024;

    // NOTE(review): alpha and beta are not read by the helpers in this header.
    float alpha = 1;
    float beta  = 0;
};
// Runs the host reference batched GEMM `BatchedGemmInstance` on A and B,
// passing C as the output tensor argument.
//
// BatchedGemmInstance must be default-constructible and provide
// MakeInvoker() and MakeArgument(A, B, C, a_op, b_op, c_op); the invoker's
// Run() is called with the argument built by the latter.
template <typename BatchedGemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostBatchedGemm(const Tensor<ADataType>& A,
                        const Tensor<BDataType>& B,
                        Tensor<CDataType>& C,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op)
{
    BatchedGemmInstance reference_gemm{};

    auto invoker = reference_gemm.MakeInvoker();
    auto argument =
        reference_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    invoker.Run(argument);
}
// Runs the device batched GEMM instance `batched_gemm_ptr` on A and B and
// stores the result in C.
//
// Device buffers are sized from each tensor's descriptor element space. A and
// B are passed to ToDevice before the run and C is filled through FromDevice
// afterwards. The batch count is taken from the first length of A; sizes and
// strides come from `params`.
//
// Throws std::runtime_error when the instance reports that it does not
// support the argument.
template <typename DeviceGemmPtr,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunDeviceBatchedGemm(DeviceGemmPtr& batched_gemm_ptr,
                          const ck::batched_gemm_util::GemmParams& params,
                          const Tensor<ADataType>& A,
                          const Tensor<BDataType>& B,
                          Tensor<CDataType>& C,
                          AElementwiseOperation a_element_op,
                          BElementwiseOperation b_element_op,
                          CElementwiseOperation c_element_op)
{
    DeviceMem a_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());

    a_buf.ToDevice(A.mData.data());
    b_buf.ToDevice(B.mData.data());

    const auto batch_count = A.mDesc.GetLengths()[0];

    auto invoker = batched_gemm_ptr->MakeInvokerPointer();

    auto* const a_dev = static_cast<ADataType*>(a_buf.GetDeviceBuffer());
    auto* const b_dev = static_cast<BDataType*>(b_buf.GetDeviceBuffer());
    auto* const c_dev = static_cast<CDataType*>(c_buf.GetDeviceBuffer());

    auto argument = batched_gemm_ptr->MakeArgumentPointer(a_dev,
                                                          b_dev,
                                                          c_dev,
                                                          params.M,
                                                          params.N,
                                                          params.K,
                                                          params.StrideA,
                                                          params.StrideB,
                                                          params.StrideC,
                                                          a_element_op,
                                                          b_element_op,
                                                          c_element_op,
                                                          batch_count);

    const bool supported = batched_gemm_ptr->IsSupportedArgument(argument.get());
    if(!supported)
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    invoker->Run(argument.get());

    c_buf.FromDevice(C.mData.data());
}
} // namespace batched_gemm_util
} // namespace ck
#endif
......@@ -182,8 +182,8 @@ int main(int argc, char* argv[])
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{5});
// reset input to zero
in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
// get host result
......@@ -225,9 +225,9 @@ int main(int argc, char* argv[])
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
......
add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_fwd.hpp"
// Forward declarations of the 2D forward-convolution instance factories used by this test.
// Only the declarations live here; the definitions are expected to come from the linked
// instance library (presumably device_conv2d_fwd_instance -- confirm against the build files).
namespace ck::tensor_operation::device::device_conv2d_fwd_instance {

// Instance pointer type whose three element-wise operations are all PassThrough.
using DeviceConvFwdNoOpPtr =
    DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough>;

// One factory per data type. Each receives the instance list by non-const reference.
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

} // namespace ck::tensor_operation::device::device_conv2d_fwd_instance
// Element-wise operation types for the input, weight and output tensors.
// All three are PassThrough; they parameterize the host reference convolution
// and are passed default-constructed to its MakeArgument call in main().
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
/// Compares a result tensor against a reference tensor element by element.
///
/// Two elements match when they are exactly equal or when their absolute
/// difference (computed in double, then narrowed to float) does not exceed 1e-6.
///
/// @param ref     Reference tensor (expected values).
/// @param result  Tensor under test.
/// @return true if both tensors hold the same number of elements and every
///         element pair matches; false otherwise.
template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
    // Absolute per-element tolerance (float, as in the original threshold).
    const float max_diff = 1e-6;

    // Differently sized buffers can never match; checking first also keeps the
    // loop below from indexing past the end of the shorter buffer.
    if(ref.mData.size() != result.mData.size())
    {
        return false;
    }

    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        const double expected = double(ref.mData[i]);
        const double actual   = double(result.mData[i]);

        // Exact equality short-circuits, so identical infinities still match
        // (inf - inf would otherwise yield NaN).
        if(expected == actual)
        {
            continue;
        }

        const float diff = std::abs(expected - actual);

        // Written as a negated "<=" so that a NaN difference is rejected:
        // every ordered comparison with NaN is false, so "max_diff < diff"
        // would silently accept a NaN result.
        if(!(diff <= max_diff))
        {
            return false;
        }
    }
    return true;
}
/// Entry point of the 2D forward-convolution test.
///
/// Accepted command lines:
///   - no arguments:      data_type = 1, init_method = 1, default conv shape;
///   - 2 arguments:       data_type and init_method, default conv shape;
///   - 17 arguments:      data_type, init_method, then N, K, C, Y, X, Hi, Wi, strides,
///                        dilations, left pads and right pads;
///   - anything else:     prints the usage text and exits with status 1.
///
/// @return 0 when the selected data type passes, -1 when it fails or when
///         data_type is not one of 0..3.
int main(int argc, char* argv[])
{
    int data_type   = 0;
    int init_method = 0;

    // Conv shape
    ck::index_t N               = 128;
    ck::index_t K               = 256;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t conv_stride_h   = 2;
    ck::index_t conv_stride_w   = 2;
    ck::index_t conv_dilation_h = 1;
    ck::index_t conv_dilation_w = 1;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        data_type   = 1;
        init_method = 1;
    }
    else if(argc == 3)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
    }
    else if(argc == 18)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);

        N               = std::stoi(argv[3]);
        K               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        conv_stride_h   = std::stoi(argv[10]);
        conv_stride_w   = std::stoi(argv[11]);
        conv_dilation_h = std::stoi(argv[12]);
        conv_dilation_w = std::stoi(argv[13]);
        in_left_pad_h   = std::stoi(argv[14]);
        in_left_pad_w   = std::stoi(argv[15]);
        in_right_pad_h  = std::stoi(argv[16]);
        in_right_pad_w  = std::stoi(argv[17]);
    }
    else
    {
        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    // Runs the whole test for one (input, weight, output) data-type triple. The arguments
    // only carry types; their values are unused. Returns 0 on pass and -1 on failure.
    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
        using InDataType  = decltype(input_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                      WeiDataType,
                                                                                      OutDataType,
                                                                                      InElementOp,
                                                                                      WeiElementOp,
                                                                                      OutElementOp>;

        // Effective (dilated) filter extent, then the resulting output spatial size.
        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;

        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

        const std::vector<ck::index_t> input_spatial_lengths{Hi, Wi};
        const std::vector<ck::index_t> filter_spatial_lengths{Y, X};
        const std::vector<ck::index_t> output_spatial_lengths{Ho, Wo};
        const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
        const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
        const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
        const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};

        // Lengths are given in (N, C, H, W) order; the strides make C the fastest-varying
        // dimension, followed by W, then H, then N.
        auto f_host_tensor_descriptor =
            [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
            };

        Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
        Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));

        std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
        std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
        std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;

        // init_method 0 leaves the tensors as constructed; 1 uses GeneratorTensor_2;
        // any other value uses GeneratorTensor_3.
        switch(init_method)
        {
        case 0: break;
        case 1:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            break;
        default:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
        }

        DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
        DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
        DeviceMem out_device_buf(sizeof(OutDataType) *
                                 out_n_k_ho_wo_device_result.mDesc.GetElementSpace());

        in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

        using PassThrough = ck::tensor_operation::element_wise::PassThrough;

        using DeviceConvFwdNoOpPtr =
            ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;

        // add device Conv instances
        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
        }

        if(conv_ptrs.empty())
        {
            throw std::runtime_error("wrong! no device Conv instance found");
        }

        // Host reference result, computed once and compared against every device instance.
        auto ref_conv     = ReferenceConvFwdInstance{};
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                  wei_k_c_y_x,
                                                  out_n_k_ho_wo_host_result,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,
                                                  input_left_pads,
                                                  input_right_pads,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});
        ref_invoker.Run(ref_argument);

        // profile device Conv instances
        // The test passes only if at least one instance supports the problem and every
        // supported instance matches the reference; the first mismatch stops the loop.
        bool success = false;
        for(auto& conv_ptr : conv_ptrs)
        {
            auto argument_ptr = conv_ptr->MakeArgumentPointer(
                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                N,
                K,
                C,
                input_spatial_lengths,
                filter_spatial_lengths,
                output_spatial_lengths,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads,
                PassThrough{},
                PassThrough{},
                PassThrough{});

            auto invoker_ptr = conv_ptr->MakeInvokerPointer();

            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                // NOTE(review): the second argument (0) is presumably a repeat/timing
                // count -- confirm against the invoker interface.
                invoker_ptr->Run(argument_ptr.get(), 0);

                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
                if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
                {
                    success = false;
                    break;
                }
                success = true;
            }
        }

        if(success)
        {
            std::cout << "test conv2d fwd : Pass" << std::endl;
            return 0;
        }
        else
        {
            std::cout << "test conv2d fwd: Fail " << std::endl;
            return -1;
        }
    };

    // Stays -1 when data_type selects none of the branches below.
    int res = -1;
    if(data_type == 0)
    {
        res = Run(float(), float(), float());
    }
    else if(data_type == 1)
    {
        res = Run(ck::half_t(), ck::half_t(), ck::half_t());
    }
    else if(data_type == 2)
    {
        // The result must be captured here as well; discarding it made the bf16
        // path always exit with -1 even when the test passed.
        res = Run(ck::bhalf_t(), ck::bhalf_t(), ck::bhalf_t());
    }
    else if(data_type == 3)
    {
        res = Run(int8_t(), int8_t(), int8_t());
    }

    return res;
}
......@@ -5,33 +5,10 @@
#include "config.hpp"
#include "conv_utils.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
namespace {
/// Checks that two vectors have the same length and equal elements.
///
/// On the first discrepancy (length first, then element by element) a
/// description of the mismatch followed by `msg` is written to std::cout.
///
/// @param out  Values produced by the code under test.
/// @param ref  Expected values.
/// @param msg  Context message printed after the mismatch description.
/// @return true when the vectors are identical, false otherwise.
template <typename T>
bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
{
    const std::size_t expected_count = ref.size();

    const bool same_length = (out.size() == expected_count);
    if(!same_length)
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    std::size_t pos = 0;
    while(pos < expected_count)
    {
        const T& actual   = out[pos];
        const T& expected = ref[pos];
        if(actual != expected)
        {
            std::cout << "out[" << pos << "] != ref[" << pos << "]: " << actual
                      << "!=" << expected << std::endl
                      << msg << std::endl;
            return false;
        }
        ++pos;
    }

    return true;
}
bool TestConvParams_GetOutputSpatialLengths()
{
bool res{true};
......@@ -43,26 +20,26 @@ bool TestConvParams_GetOutputSpatialLengths()
// padding {{1,1}, {1,1}}
ck::conv_util::ConvParams conv_params;
std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(out_spatial_len,
std::vector<ck::index_t>{36, 36},
"Error: ConvParams 2D default constructor.");
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36},
"Error: ConvParams 2D default constructor.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(
res = test::check_err(
out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
conv_params.input_left_pads = std::vector<ck::index_t>{2, 2};
conv_params.input_right_pads = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(out_spatial_len,
std::vector<ck::index_t>{37, 37},
"Error: ConvParams 2D padding left/right {2,2}.");
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37},
"Error: ConvParams 2D padding left/right {2,2}.");
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(
res = test::check_err(
out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
......@@ -70,9 +47,9 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(out_spatial_len,
std::vector<ck::index_t>{23, 23},
"Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{23, 23},
"Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
// -------------------------- 1D ------------------------------------
conv_params.num_dim_spatial = 1;
......@@ -84,25 +61,24 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D default constructor.");
res = test::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res =
cmp_vec(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
res = test::check_err(
out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
conv_params.input_left_pads = std::vector<ck::index_t>{2};
conv_params.input_right_pads = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(out_spatial_len,
std::vector<ck::index_t>{37},
"Error: ConvParams 1D padding left/right {2}.");
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37},
"Error: ConvParams 1D padding left/right {2}.");
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(
res = test::check_err(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
......@@ -110,9 +86,52 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = cmp_vec(out_spatial_len,
std::vector<ck::index_t>{23},
"Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{23},
"Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
// -------------------------- 3D ------------------------------------
conv_params.num_dim_spatial = 3;
conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
conv_params.input_spatial_lengths = std::vector<ck::index_t>{71, 71, 71};
conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{1, 1, 1};
conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{71, 71, 71},
"Error: ConvParams 3D stride {1, 1, 1}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
conv_params.input_left_pads = std::vector<ck::index_t>{2, 2, 2};
conv_params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37, 37},
"Error: ConvParams 3D padding left/right {2, 2, 2}.");
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36, 36},
"Error: ConvParams 3D dilation {2, 2, 2}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3, 3};
conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
out_spatial_len,
std::vector<ck::index_t>{23, 23, 23},
"Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
return res;
}
......@@ -123,23 +142,44 @@ bool TestGetHostTensorDescriptor()
namespace tl = ck::tensor_layout::convolution;
std::vector<std::size_t> dims{2, 3, 4, 5};
HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
res =
cmp_vec(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
res = test::check_err(
h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
res =
cmp_vec(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
res = test::check_err(
h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
dims = std::vector<std::size_t>{2, 3, 4};
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
res = cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
res = test::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
res = cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
res = test::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
res = test::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
res = test::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
1, // C
3 * 5 * 6, // D
3 * 6, // H
3}, // W
"Error: wrong NDHWC dimensions strides!");
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCDHW{});
res = test::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
res = test::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
4 * 5 * 6, // C
5 * 6, // D
6, // H
1}, // W
"Error: wrong NCDHW dimensions strides!");
return res;
}
......
add_test_executable(test_convnd_fwd convnd_fwd.cpp)
target_link_libraries(test_convnd_fwd PRIVATE host_tensor)
add_custom_target(test_convnd_fwd)
add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
add_dependencies(test_convnd_fwd test_conv1d_fwd)
add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
add_dependencies(test_convnd_fwd test_conv2d_fwd)
add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
add_dependencies(test_convnd_fwd test_conv3d_fwd)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment