Unverified Commit 5d37d7bf authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Reorganize files, Part 1 (#119)

* delete obselete files

* move files

* build

* update cmake

* update cmake

* fix build

* reorg examples

* update cmake for example and test
parent 245f7414
add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
......@@ -14,7 +14,6 @@
#include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
......
add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
......@@ -12,7 +12,7 @@
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
......
# Instructions for ```conv_xdl_bias_relu_add``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv_xdl_bias_relu_add```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv_xdl_bias_relu_add
```
## Run ```conv_xdl_bias_relu_add```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv_xdl_bias_relu_add 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
bias_k: dim 1, lengths {256}, strides {1}
resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
arg.c_grid_desc_m_n_{ 165888, 256}
arg.c0_grid_desc_m_n_{ 165888, 256}
arg.c1_grid_desc_m_n_{ 165888, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
```
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "convolution_utility.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
// clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
// | InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
// | | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 8, 1, 1,32>, 2>;
// clang-format on
template <typename TIn,
typename TWei,
typename TOut,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const Tensor<TWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const Tensor<TOut>& bias_k,
const std::vector<ck::index_t>& conv_strides,
const std::vector<ck::index_t>& conv_dilations,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& /* in_right_pads */,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
{
float v_in;
float v_wei;
in_element_op(v_in, static_cast<const float>(in_n_c_hi_wi(n, c, hi, wi)));
wei_element_op(v_wei, static_cast<const float>(wei_k_c_y_x(k, c, y, x)));
v_acc += v_in * v_wei;
}
}
}
}
float v_out;
out_element_op(v_out, v_acc, static_cast<float>(bias_k(k)));
out_n_k_ho_wo(n, k, ho, wo) += v_out;
};
make_ParallelTensorFunctor(f_nchw,
out_n_k_ho_wo.mDesc.GetLengths()[0],
out_n_k_ho_wo.mDesc.GetLengths()[1],
out_n_k_ho_wo.mDesc.GetLengths()[2],
out_n_k_ho_wo.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
Y = std::stoi(argv[7]);
X = std::stoi(argv[8]);
Hi = std::stoi(argv[9]);
Wi = std::stoi(argv[10]);
conv_stride_h = std::stoi(argv[11]);
conv_stride_w = std::stoi(argv[12]);
conv_dilation_h = std::stoi(argv[13]);
conv_dilation_w = std::stoi(argv[14]);
in_left_pad_h = std::stoi(argv[15]);
in_left_pad_w = std::stoi(argv[16]);
in_right_pad_h = std::stoi(argv[17]);
in_right_pad_w = std::stoi(argv[18]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
const auto output_spatial_lengths =
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
{Y, X},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
// tensor layout
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H,
std::size_t W,
auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::KYXC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
host_reference_calculation(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
bias_k,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
}
}
# Instructions for ```conv2d_fwd_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_fwd_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv2d_fwd_xdl
```
## Run ```conv2d_fwd_xdl_int8```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv2d_fwd_xdl_int8 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
arg.c_grid_desc_m_n_{ 165888, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
```
include_directories(BEFORE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/reference_operation/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/device_operation_reference/include
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/external/include/half
)
set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp)
set(GEMM_XDL_INT8_SOURCE 1_gemm_xdl/gemm_xdl_int8.cpp)
set(GEMM_XDL_BF16_SOURCE 1_gemm_xdl/gemm_xdl_bf16.cpp)
set(GEMM_XDL_BIAS_RELU_SOURCE 2_gemm_xdl_bias_relu/gemm_xdl_bias_relu.cpp)
set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 3_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_SOURCE 4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp)
set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
set(CONV2D_WRW_XDL_SOURCE 13_conv2d_backward_weight_xdl/main.cpp)
set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp)
set(POOL2D_FWD_SOURCE 12_pool2d_fwd/pool2d_fwd.cpp)
set(REDUCE_BLOCKWISE_SOURCE 13_reduce_blockwise/reduce_blockwise.cpp)
add_custom_target(examples)
add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
add_executable(gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
add_executable(gemm_xdl_bias_relu ${GEMM_XDL_BIAS_RELU_SOURCE})
add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
add_executable(conv2d_fwd_xdl ${CONV2D_FWD_XDL_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_atomic_add ${CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE})
add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
add_executable(conv2d_wrw_xdl ${CONV2D_WRW_XDL_SOURCE})
add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE})
add_executable(pool2d_fwd ${POOL2D_FWD_SOURCE})
add_executable(reduce_blockwise ${REDUCE_BLOCKWISE_SOURCE})
target_link_libraries(gemm_xdl PRIVATE host_tensor)
target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_atomic_add PRIVATE host_tensor)
target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
target_link_libraries(conv2d_wrw_xdl PRIVATE host_tensor)
target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor)
target_link_libraries(pool2d_fwd PRIVATE host_tensor)
target_link_libraries(reduce_blockwise PRIVATE host_tensor)
function(add_example_executable EXAMPLE_NAME)
message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${ARGN})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_dependencies(examples ${EXAMPLE_NAME})
endfunction(add_example_executable EXAMPLE_NAME)
add_subdirectory(01_gemm)
add_subdirectory(02_gemm_alpha_beta)
add_subdirectory(03_gemm_bias_relu)
add_subdirectory(04_gemm_bias_relu_add)
add_subdirectory(05_conv2d_fwd)
add_subdirectory(06_conv2d_fwd_bias_relu)
add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(08_conv3d_fwd)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
add_subdirectory(11_conv2d_bwd_wgt)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(host_tensor)
#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int M1PerThread;
int N1PerThread;
int KPerThread;
int M1N1ThreadClusterM10;
int M1N1ThreadClusterN10;
int M1N1ThreadClusterM11;
int M1N1ThreadClusterN11;
std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 6> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int MPerXDL;
int NPerXDL;
int K1;
int MRepeat;
int NRepeat;
std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 8> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerXDL,
32, // NPerXDL,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{0, 2, 1}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
1, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int MPerWave;
int NPerWave;
int K1;
int MRepeat;
int NRepeat;
std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 8> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{1, 0, 2}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR
namespace ck {
namespace driver {
struct ConvolutionProblemDescriptor
{
ConvolutionProblemDescriptor() = default;
ConvolutionProblemDescriptor(int N_,
int K_,
int C_,
int Y_,
int X_,
int Hi_,
int Wi_,
int Ho_,
int Wo_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
ck::DataTypeEnum_t InDataTypeEnum_,
ck::DataTypeEnum_t WeiDataTypeEnum_,
ck::DataTypeEnum_t OutDataTypeEnum_)
: N{N_},
K{K_},
C{C_},
Y{Y_},
X{X_},
Hi{Hi_},
Wi{Wi_},
Ho{Ho_},
Wo{Wo_},
ConvStrideH{ConvStrideH_},
ConvStrideW{ConvStrideW_},
ConvDilationH{ConvDilationH_},
ConvDilationW{ConvDilationW_},
InLeftPadH{InLeftPadH_},
InLeftPadW{InLeftPadW_},
InRightPadH{InRightPadH_},
InRightPadW{InRightPadW_},
InDataTypeEnum{InDataTypeEnum_},
WeiDataTypeEnum{WeiDataTypeEnum_},
OutDataTypeEnum{OutDataTypeEnum_}
{
}
int N;
int K;
int C;
int Y;
int X;
int Hi;
int Wi;
int Ho;
int Wo;
int ConvStrideH;
int ConvStrideW;
int ConvDilationH;
int ConvDilationW;
int InLeftPadH;
int InLeftPadW;
int InRightPadH;
int InRightPadW;
ck::DataTypeEnum_t InDataTypeEnum;
ck::DataTypeEnum_t WeiDataTypeEnum;
ck::DataTypeEnum_t OutDataTypeEnum;
std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
};
} // namespace driver
} // namespace ck
#endif
#ifndef CK_SOLVER_COMMON_HPP
#define CK_SOLVER_COMMON_HPP
namespace ck {
namespace driver {
// greatest common divisor, aka highest common factor
inline int gcd(int x, int y)
{
if(x < 0)
{
return gcd(-x, y);
}
else if(y < 0)
{
return gcd(x, -y);
}
else if(x == y || x == 0)
{
return y;
}
else if(y == 0)
{
return x;
}
else if(x > y)
{
return gcd(x % y, y);
}
else
{
return gcd(x, y % x);
}
}
template <typename X,
typename... Ys,
typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
auto gcd(X x, Ys... ys)
{
return gcd(x, gcd(ys...));
}
} // namespace driver
} // namespace ck
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment