"ml/vscode:/vscode.git/clone" did not exist on "943464ccb85678069b2ad44051f13203c51e67b0"
Commit dd6a8de4 authored by Jehandad Khan's avatar Jehandad Khan
Browse files

Merge branch 'develop' into jd/dev_pkg

parents 0aa899aa abf4bdb9
add_example_executable(example_conv3d_fwd_xdl conv3d_fwd_xdl.cpp)
# Instructions for ```conv3d_fwd_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv3d_fwd_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv3d_fwd_xdl
```
## Run ```conv3d_fwd_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, leftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx
./example/conv3d_fwd_xdl 0 1 5
```
Result (MI100 dynamic frequency)
```
in: dim 5, lengths {4, 71, 71, 71, 192}, strides {68718912, 967872, 13632, 192, 1}
wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1}
out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1}
a_grid_desc_b_k0_m_k1{1, 648, 186624, 8}
b_grid_desc_b_k0_n_k1{1, 648, 256, 8}
launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 4.49466 ms, 110.206 TFlops, 144.161 GB/s
```
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp"
#include "device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp"
#include "convolution_utility.hpp"
// convolution data type
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using F16 = ck::half_t;
using F32 = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NDHWC;
using WeiLayout = ck::tensor_layout::convolution::KZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWK;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
using DeviceConv3dFwdInstance = ck::tensor_operation::device::
DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<
InDataType, // InData
WeiDataType, // WeiData
OutDataType, // OutData
AccDataType, // AccData
InElementOp, // InElementwise Operation
WeiElementOp, // WeiElementwise Operation
OutElementOp, // OutElementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1. K0PerBlock * K1 = KPerBlock
32, // MPerXDL
32, // NPerXDL. Each XDL computes a matrix of size (MPerXDL, NPerBlock)
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
int main(int argc, char* argv[])
{
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
// convolution shape
ck::index_t N = 4;
ck::index_t K = 256;
ck::index_t C = 192;
std::vector<ck::index_t> in_spatial_lengths = {71, 71, 71};
std::vector<ck::index_t> filter_spatial_lengths = {3, 3, 3};
std::vector<ck::index_t> conv_filter_strides = {2, 2, 2};
std::vector<ck::index_t> conv_filter_dilations = {1, 1, 1};
std::vector<ck::index_t> in_left_pads = {1, 1, 1};
std::vector<ck::index_t> in_right_pads = {1, 1, 1};
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 25)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
filter_spatial_lengths[0] = std::stoi(argv[7]);
filter_spatial_lengths[1] = std::stoi(argv[8]);
filter_spatial_lengths[2] = std::stoi(argv[9]);
in_spatial_lengths[0] = std::stoi(argv[10]);
in_spatial_lengths[1] = std::stoi(argv[11]);
in_spatial_lengths[2] = std::stoi(argv[12]);
conv_filter_strides[0] = std::stoi(argv[13]);
conv_filter_strides[1] = std::stoi(argv[14]);
conv_filter_strides[2] = std::stoi(argv[15]);
conv_filter_dilations[0] = std::stoi(argv[16]);
conv_filter_dilations[1] = std::stoi(argv[17]);
conv_filter_dilations[2] = std::stoi(argv[18]);
in_left_pads[0] = std::stoi(argv[19]);
in_left_pads[1] = std::stoi(argv[20]);
in_left_pads[2] = std::stoi(argv[21]);
in_right_pads[0] = std::stoi(argv[22]);
in_right_pads[1] = std::stoi(argv[23]);
in_right_pads[2] = std::stoi(argv[24]);
}
else
{
printf("Usage: 3 or 24 input arguments\n");
printf(" arg1: verification (0=no, 1=yes)\n");
printf(" arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf(" arg3: run kernel # of times (>1)\n");
printf(" arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sz, Dz, Dy, Dx, LeftPz, LeftPy, "
"LeftPz, RightPz, RightPy, RightPx\n");
exit(0);
}
auto conv3d = DeviceConv3dFwdInstance{};
const auto out_spatial_lengths =
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths(
in_spatial_lengths,
filter_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
in_left_pads,
in_right_pads);
Tensor<InDataType> in(
{N, in_spatial_lengths[0], in_spatial_lengths[1], in_spatial_lengths[2], C});
Tensor<WeiDataType> wei(
{K, filter_spatial_lengths[0], filter_spatial_lengths[1], filter_spatial_lengths[2], C});
Tensor<OutDataType> out(
{N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K});
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpace());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
// do Convolution
auto invoker = conv3d.MakeInvoker();
auto argument = conv3d.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
in_spatial_lengths,
filter_spatial_lengths,
out_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
in_left_pads,
in_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv3d.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv3d with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, nrepeat);
const auto Di = in_spatial_lengths[0];
const auto Hi = in_spatial_lengths[1];
const auto Wi = in_spatial_lengths[2];
const auto Do = out_spatial_lengths[0];
const auto Ho = out_spatial_lengths[1];
const auto Wo = out_spatial_lengths[2];
const auto Z = filter_spatial_lengths[0];
const auto Y = filter_spatial_lengths[1];
const auto X = filter_spatial_lengths[2];
std::size_t flop = std::size_t(2) * N * K * Do * Ho * Wo * C * Z * Y * X;
std::size_t num_btype = sizeof(InDataType) * N * Di * Hi * Wi * C +
sizeof(WeiDataType) * K * Z * Y * X * C +
sizeof(OutDataType) * N * Do * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
out_device_buf.FromDevice(out.mData.data());
if(do_verification)
{
DeviceMem out_ref_device_buf(sizeof(OutDataType) * N * Do * Ho * Wo * K);
using DeviceConv3dFwdNaive = ck::tensor_operation::device::
DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<
InDataType,
WeiDataType,
OutDataType,
AccDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto conv3d_naive = DeviceConv3dFwdNaive{};
auto invoker_naive = conv3d_naive.MakeInvoker();
auto argument_naive = conv3d_naive.MakeArgument(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_ref_device_buf.GetDeviceBuffer()),
N,
K,
C,
in_spatial_lengths,
filter_spatial_lengths,
out_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
in_left_pads,
in_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv3d_naive.IsSupportedArgument(argument_naive))
{
throw std::runtime_error(
"wrong! device_conv3d_naive does NOT support the specified compilation parameters");
}
invoker_naive.Run(argument_naive);
Tensor<OutDataType> out_ref(
{N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K});
out_ref_device_buf.FromDevice(out_ref.mData.data());
check_error(out_ref, out);
}
return 0;
}
add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
# Instructions for ```convnd_fwd_xdl``` Example
# Instructions for ```example_convnd_fwd_xdl```
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```convnd_fwd_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j convnd_fwd_xdl
```
## Run ```convnd_fwd_xdl```
## Run ```example_convnd_fwd_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
......@@ -47,7 +14,7 @@ cmake \
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
./example/convnd_fwd_xdl 0 1 100
./bin/example_convnd_fwd_xdl 0 1 100
```
Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
......
......@@ -2,8 +2,10 @@
#include <iostream>
#include <numeric>
#include <type_traits>
#include "check_err.hpp"
#include "config.hpp"
#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
......@@ -13,6 +15,8 @@
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
namespace {
using InDataType = float;
using WeiDataType = float;
using OutDataType = float;
......@@ -26,7 +30,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
using DeviceConvFwdBasePtr =
ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
......@@ -80,10 +84,13 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
OutElementOp,
NumDimSpatial>;
DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial)
DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 3: {
return std::make_unique<DeviceConvNDFwdInstance<3>>();
}
case 2: {
return std::make_unique<DeviceConvNDFwdInstance<2>>();
}
......@@ -96,7 +103,7 @@ DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial)
}
}
void PrintUseMsg()
void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
......@@ -113,18 +120,18 @@ void PrintUseMsg()
<< std::endl;
}
ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* argv[])
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 5;
if(cmdline_nargs != argc)
{
PrintUseMsg();
print_use_msg();
exit(0);
}
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
int arg_idx = 5;
params.num_dim_spatial = num_dim_spatial;
......@@ -166,71 +173,18 @@ ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* a
return params;
}
HostTensorDescriptor GetOutputHostTensorDescriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 2: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{});
}
case 1: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
HostTensorDescriptor GetFiltersHostTensorDescriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 2: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{});
}
case 1: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
HostTensorDescriptor GetInputHostTensorDescriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;
switch(num_dim_spatial)
{
case 2: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
}
case 1: {
return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
} // anonymous namespace
int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
int num_dim_spatial = 2;
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
if(argc >= 5)
{
......@@ -242,7 +196,7 @@ int main(int argc, char* argv[])
if(argc >= 6)
{
params = ParseConvParams(num_dim_spatial, argc, argv);
params = parse_conv_params(num_dim_spatial, argc, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
......@@ -264,10 +218,12 @@ int main(int argc, char* argv[])
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(GetInputHostTensorDescriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(GetFiltersHostTensorDescriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
......@@ -293,7 +249,7 @@ int main(int argc, char* argv[])
wei_device_buf.ToDevice(weights.mData.data());
// do GEMM
auto conv = GetConvInstance(num_dim_spatial);
auto conv = get_conv_instance(num_dim_spatial);
auto invoker = conv->MakeInvokerPointer();
auto argument =
conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
......@@ -322,10 +278,10 @@ int main(int argc, char* argv[])
float ave_time = invoker->Run(argument.get(), nrepeat);
std::size_t flop = ck::conv_util::GetFlops(
std::size_t flop = get_flops(
params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype =
ck::conv_util::GetBtype<InDataType, WeiDataType, OutDataType>(params.N,
get_btype<InDataType, WeiDataType, OutDataType>(params.N,
params.C,
params.K,
params.input_spatial_lengths,
......@@ -355,11 +311,17 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
check_error(host_output, device_output);
ck::utils::check_err(
host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
};
switch(num_dim_spatial)
{
case 3: {
auto ref_conv = ReferenceConvNDFwdInstance<3>();
verify_f(ref_conv);
break;
}
case 2: {
auto ref_conv = ReferenceConvNDFwdInstance<2>();
verify_f(ref_conv);
......
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "check_err.hpp"
#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
namespace {
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
using DeviceConvFwdBasePtr =
ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NumDimSpatial>;
DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 3: {
return std::make_unique<DeviceConvNDFwdInstance<3>>();
}
case 2: {
return std::make_unique<DeviceConvNDFwdInstance<2>>();
}
case 1: {
return std::make_unique<DeviceConvNDFwdInstance<1>>();
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: run kernel # of times (>1)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 5;
if(cmdline_nargs != argc)
{
print_use_msg();
exit(0);
}
ck::utils::conv::ConvParams params;
int arg_idx = 5;
params.num_dim_spatial = num_dim_spatial;
params.N = std::stoi(argv[arg_idx++]);
params.K = std::stoi(argv[arg_idx++]);
params.C = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
} // anonymous namespace
int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
if(argc >= 5)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
if(argc >= 6)
{
params = parse_conv_params(num_dim_spatial, argc, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
// do GEMM
auto conv = get_conv_instance(num_dim_spatial);
auto invoker = conv->MakeInvokerPointer();
auto argument =
conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv->IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float ave_time = invoker->Run(argument.get(), nrepeat);
std::size_t flop = get_flops(
params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
params.N,
params.C,
params.K,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
const auto& ref_conv) {
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
ck::utils::check_err(
host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
};
switch(num_dim_spatial)
{
case 3: {
auto ref_conv = ReferenceConvNDFwdInstance<3>();
verify_f(ref_conv);
break;
}
case 2: {
auto ref_conv = ReferenceConvNDFwdInstance<2>();
verify_f(ref_conv);
break;
}
case 1: {
auto ref_conv = ReferenceConvNDFwdInstance<1>();
verify_f(ref_conv);
break;
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
}
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "check_err.hpp"
#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
namespace {
using InDataType = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
using AccDataType = int32_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
using DeviceConvFwdBasePtr =
ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
16, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NumDimSpatial>;
DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
switch(num_dim_spatial)
{
case 3: {
return std::make_unique<DeviceConvNDFwdInstance<3>>();
}
case 2: {
return std::make_unique<DeviceConvNDFwdInstance<2>>();
}
case 1: {
return std::make_unique<DeviceConvNDFwdInstance<1>>();
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: run kernel # of times (>1)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 5;
if(cmdline_nargs != argc)
{
print_use_msg();
exit(0);
}
ck::utils::conv::ConvParams params;
int arg_idx = 5;
params.num_dim_spatial = num_dim_spatial;
params.N = std::stoi(argv[arg_idx++]);
params.K = std::stoi(argv[arg_idx++]);
params.C = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
} // anonymous namespace
int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
if(argc >= 5)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
if(argc >= 6)
{
params = parse_conv_params(num_dim_spatial, argc, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
// do GEMM
auto conv = get_conv_instance(num_dim_spatial);
auto invoker = conv->MakeInvokerPointer();
auto argument =
conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv->IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float ave_time = invoker->Run(argument.get(), nrepeat);
std::size_t flop = get_flops(
params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
params.N,
params.C,
params.K,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
const auto& ref_conv) {
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
ck::utils::check_err(
host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
};
switch(num_dim_spatial)
{
case 3: {
auto ref_conv = ReferenceConvNDFwdInstance<3>();
verify_f(ref_conv);
break;
}
case 2: {
auto ref_conv = ReferenceConvNDFwdInstance<2>();
verify_f(ref_conv);
break;
}
case 1: {
auto ref_conv = ReferenceConvNDFwdInstance<1>();
verify_f(ref_conv);
break;
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}
}
# Instructions for ```conv2d_bwd_data_xdl``` Example
# Instructions for ```example_conv2d_bwd_data_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_bwd_data_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv2d_bwd_data_xdl
```
## Run ```conv2d_bwd_data_xdl```
## Run ```example_conv2d_bwd_data_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/conv2d_bwd_data_xdl 0 1 5
./bin/example_conv2d_bwd_data_xdl 0 1 5
```
Result
......
......@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
......@@ -27,7 +29,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvBwdDefault =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
using DeviceConvBwdDataInstance = ck::tensor_operation::device::
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
......@@ -38,7 +40,7 @@ using DeviceConvBwdDataInstance = ck::tensor_operation::device::
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
ConvBwdDefault, // ConvolutionBackwardDataSpecialization
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
......@@ -68,6 +70,7 @@ using DeviceConvBwdDataInstance = ck::tensor_operation::device::
using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
WeiDataType,
OutDataType,
AccDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
......@@ -180,6 +183,10 @@ int main(int argc, char* argv[])
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
// reset input to zero
in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
// do GEMM
auto conv = DeviceConvBwdDataInstance{};
auto invoker = conv.MakeInvoker();
......@@ -242,6 +249,6 @@ int main(int argc, char* argv[])
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
}
}
add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
# Instructions for ```conv2d_wrw_xdl``` Example
# Instructions for ```example_conv2d_bwd_weight_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_wrw_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv2d_wrw_xdl
```
## Run ```conv2d_wrw_xdl```
## Run ```example_conv2d_bwd_weight_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4: is show log (0=no, 1=yes)
#arg5 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx, split-k
./example/conv2d_fwd_xdl 0 1 5 0 4
./bin/example_conv2d_bwd_weight_xdl 0 1 5 0 4
```
Result
......
......@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
......@@ -32,8 +34,8 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
// clang-format off
using DeviceConvWrWInstance = ck::tensor_operation::device::
DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
......@@ -70,8 +72,8 @@ using DeviceConvWrWInstance = ck::tensor_operation::device::
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceConvWrwInstance = ck::tensor_operation::host::
ReferenceConvWrw<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
using ReferenceConvBwdWeightInstance = ck::tensor_operation::host::
ReferenceConvBwdWeight<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
int main(int argc, char* argv[])
{
......@@ -211,7 +213,7 @@ int main(int argc, char* argv[])
wei_device_buf.ToDevice(wei_k_c_y_x_device_result.mData.data());
// do GEMM
auto conv = DeviceConvWrWInstance{};
auto conv = DeviceConvBwdWeightInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
......@@ -256,7 +258,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
auto ref_conv = ReferenceConvWrwInstance{};
auto ref_conv = ReferenceConvBwdWeightInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
......@@ -284,6 +286,6 @@ int main(int argc, char* argv[])
LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
<< std::endl;
}
check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
}
}
add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
# Instructions for ```example_reduce_blockwise```
## Run ```example_reduce_blockwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: run kernel # of times (>1)
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
```
Result
```
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 10 times...
Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 528, 528
```
......@@ -4,6 +4,8 @@
#include <cstdlib>
#include <getopt.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
......@@ -13,28 +15,29 @@
#include "device_base.hpp"
#include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp"
#include "host_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InDataType = half_float::half;
using OutDataType = half_float::half;
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using kInDataType = ck::half_t;
using kOutDataType = ck::half_t;
using kAccDataType = float;
using HostInDataType = half_float::half;
using HostOutDataType = half_float::half;
using HostAccDataType = float;
constexpr int Rank = 4;
using ReduceDims_ = ck::Sequence<0, 1, 2>;
constexpr int NumReduceDim = 3;
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
......@@ -42,11 +45,11 @@ using InElementwiseOperation =
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
kAccDataType,
kOutDataType,
using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
AccDataType,
OutDataType,
Rank,
ReduceDims_,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
......@@ -134,6 +137,10 @@ class SimpleAppArgs
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
};
int processArgs(int argc, char* argv[])
......@@ -192,50 +199,24 @@ class SimpleAppArgs
};
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_reduce_dims()
{
std::vector<int> resDims;
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
return (resDims);
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_invariant_dims()
{
std::vector<int> resDims;
unsigned int incFlag = 0;
static_for<0, ReduceDims::Size(), 1>{}(
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
for(int dim = 0; dim < Rank; dim++)
{
if(incFlag & (0x1 << dim))
continue;
resDims.push_back(dim);
};
return (resDims);
};
int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
const std::vector<int> reduceDims{0, 1, 2};
const std::vector<int> invariantDims{3};
SimpleAppArgs args;
if(args.processArgs(argc, argv) < 0)
return (-1);
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX);
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
// if input is half type, no reason to use float for indiced reduction operation and must use
// float for non-indiced reduction operation for accuracy
......@@ -251,7 +232,7 @@ int main(int argc, char* argv[])
// indices option can only be used when it is really needed
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
......@@ -260,15 +241,12 @@ int main(int argc, char* argv[])
Tensor<InDataType> in(args.inLengths);
const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
std::vector<size_t> outLengths;
if(InvariantDims.empty())
if(invariantDims.empty())
outLengths.push_back(1);
else
for(auto dim : InvariantDims)
for(auto dim : invariantDims)
outLengths.push_back(args.inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths);
......@@ -285,26 +263,27 @@ int main(int argc, char* argv[])
float alpha = args.scales[0];
float beta = args.scales[1];
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t num_thread = 1;
if(args.do_verification)
{
switch(args.init_method)
{
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
break;
case 1:
case 2:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
}
if(beta != 0.0f)
......@@ -321,17 +300,27 @@ int main(int argc, char* argv[])
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes);
if(args.do_verification)
{
ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims);
ReductionHost<HostInDataType,
HostAccDataType,
HostOutDataType,
ReduceOpId,
Rank,
NumReduceDim,
PropagateNan,
NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
hostReduce.Run(alpha,
reinterpret_cast<const HostInDataType*>(in.mData.data()),
beta,
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
out_indices_ref.mData.data());
};
const auto i_inLengths = to_int_vector(args.inLengths);
......@@ -341,7 +330,7 @@ int main(int argc, char* argv[])
auto reduce = DeviceReduceInstance{};
auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
DeviceMem ws_dev(wsSizeInBytes);
......@@ -350,6 +339,7 @@ int main(int argc, char* argv[])
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
......@@ -383,12 +373,13 @@ int main(int argc, char* argv[])
if(args.do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
ck::utils::check_err(out.mData, out_ref.mData);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
};
};
}
# Instructions for ```example_pool2d_fwd``` Example
## Run ```example_pool2d_fwd```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd 1 1 10
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up
Start running 10 times...
Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
error: 0
max_diff: 0, 1, 1
```
......@@ -3,6 +3,8 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
......@@ -22,9 +24,9 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX;
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG;
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool NeedIndices = false;
......@@ -47,7 +49,7 @@ using DevicePoolFwdInstance =
template <typename InDataType,
typename OutDataType,
typename AccDataType,
ck::ReduceTensorOp_t ReduceOpId,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool NeedIndices>
static void pool_host_verify(const Tensor<InDataType>& in,
......@@ -236,8 +238,9 @@ int main(int argc, char* argv[])
switch(init_method)
{
case 0: break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
......@@ -299,13 +302,14 @@ int main(int argc, char* argv[])
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
check_error(out_n_c_ho_wo_host, out_n_c_ho_wo_device);
ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
if constexpr(NeedIndices)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
// check_indices(out_indices_n_c_ho_wo_host, out_indices_n_c_ho_wo_device);
// ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
// out_indices_n_c_ho_wo_host.mData);;
};
}
}
add_example_executable(example_gemm_xdl_requant_relu_requant_int8 gemm_xdl_requant_relu_requant_int8.cpp)
\ No newline at end of file
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl.hpp"
#include "device_gemm_xdl_c_shuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using RequantReluRequant = ck::tensor_operation::element_wise::RequantReluRequant;
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int32_t;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle<
ADataType, // ADataType
BDataType, // BDataType
CDataType, // CDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
ALayout, // ALayout
BLayout, // BLayout
CLayout, // CLayout
PassThrough, // AElementwiseOperation
PassThrough, // BElementwiseOperation
RequantReluRequant, // CElementwiseOperation
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXDL
32, // NPerXDL
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 1, 64, 1, 1, 4>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
16>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, RequantReluRequant>;
int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
float scale_gemm = 0.03;
float scale_relu = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto c_element_op = RequantReluRequant{scale_gemm, scale_relu};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
if(do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
}
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment