Commit b79df771 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/device/include
${PROJECT_SOURCE_DIR}/host/solver/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
)
set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
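# Hypothetical build/run sketch (assumed, not part of the original file): each
# driver above is a standalone benchmark; from a CMake build directory one would
# configure and then build a single target, e.g.
#   cmake <path-to-source> && make conv_fwd_driver_offline
# Every driver prints its expected argument list when invoked with too few
# arguments (see the printf usage blocks in the corresponding sources below).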
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <thread>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
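// Host reference for the fused conv+add kernel below: a direct NC0HWC1
// convolution (vector dimensions C1/K1 innermost), followed by bias-add and
// activation. The resulting value v is then broadcast into a 2x2 window of the
// residual tensor `add`, whose spatial size is 2*Ho x 2*Wo, i.e. the fusion
// combines the conv output with a nearest-neighbor 2x upsample before the
// elementwise add.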
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& add,
const Tensor<TOut>& bias,
Tensor<TOut>& add_host,
Tensor<TOut>& out_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
out_host(n, k0, ho, wo, k1) = v;
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const auto Hox2 = Ho * 2;
const auto Wox2 = Wo * 2;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
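// Hypothetical static-mode invocation (this file is built with
// USE_DYNAMIC_MODE=0, so only the five control arguments are read;
// algo 0 = V5R1NCHWC):
//   ./conv_add_fwd_driver_offline_nchwc 0 1 4 0 10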
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<135>{};
constexpr auto Wi = Number<240>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<32>{};
constexpr auto Wi = Number<32>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
constexpr auto Hox2 = Number<Ho * 2>{};
constexpr auto Wox2 = Number<Wo * 2>{};
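// With the active configuration above (Hi=135, Wi=240, 3x3 filter, stride 1,
// dilation 1, pad 1 on all sides): YEff = 3, Ho = (135 + 1 + 1 - 3) / 1 + 1 = 135,
// Wo = 240, and the residual tensor is Hox2 x Wox2 = 270 x 480.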
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
add_lengths_host(5), bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
add_lengths_host[0] = static_cast<std::size_t>(N);
add_lengths_host[1] = static_cast<std::size_t>(K0);
add_lengths_host[2] = static_cast<std::size_t>(Hox2);
add_lengths_host[3] = static_cast<std::size_t>(Wox2);
add_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> add(add_lengths_host);
Tensor<out_data_t> add_device(add_lengths_host);
Tensor<out_data_t> add_host(add_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto add_lengths_dev = make_tuple(N, K0, Hox2, Wox2, K1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
add_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
acc_data_t,
out_data_t,
activ_type>(
tmp[I0], // in_lengths_dev
tmp[I1], // wei_lengths_dev
tmp[I2], // add_lengths_dev
tmp[I3], // out_lengths_dev
tmp[I4], // conv_strides_dev
tmp[I5], // conv_dilations_dev
tmp[I6], // in_left_pads_dev
tmp[I7], // in_right_pads_dev
in,
wei,
bias,
add,
add_device,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution_add_nchwc(in,
wei,
add,
bias,
add_host,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
ck::utils::check_err(add_device.mData, add_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <thread>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp"
#define USE_DYNAMIC_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 0
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
enum ConvBackwardDataAlgo
{
V4R1XDLNHWC, // 0
V4R1R2XDLNHWC, // 1
};
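// Host reference for backward-data: instead of scattering gradients from the
// output, each input element in(n, c, hi, wi) gathers from the output. The
// forward relation hi = ho * stride + y * dilation - pad is inverted to
// ho = (hi + pad - y * dilation) / stride, and a filter tap contributes only
// when the division is exact and ho/wo fall inside the output extent.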
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_convolution_backward_data(Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& /* in_right_pads */,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I2];
std::size_t X = wei.mDesc.GetLengths()[I3];
std::size_t Ho = out.mDesc.GetLengths()[I2];
std::size_t Wo = out.mDesc.GetLengths()[I3];
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
if(h_tmp % conv_strides[I0] == 0)
{
int ho = h_tmp / conv_strides[I0];
if(ho >= 0 && ho < Ho)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
if(w_tmp % conv_strides[I1] == 0)
{
int wo = w_tmp / conv_strides[I1];
if(wo >= 0 && wo < Wo)
{
for(int k = 0; k < K; ++k)
{
v += out(n, k, ho, wo) * wei(k, c, y, x);
}
}
}
}
}
}
}
in(n, c, hi, wi) = v;
};
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I1];
std::size_t X = wei.mDesc.GetLengths()[I2];
std::size_t Ho = out.mDesc.GetLengths()[I1];
std::size_t Wo = out.mDesc.GetLengths()[I2];
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
if(h_tmp % conv_strides[I0] == 0)
{
int ho = h_tmp / conv_strides[I0];
if(ho >= 0 && ho < Ho)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
if(w_tmp % conv_strides[I1] == 0)
{
int wo = w_tmp / conv_strides[I1];
if(wo >= 0 && wo < Wo)
{
for(int k = 0; k < K; ++k)
{
v += out(n, ho, wo, k) * wei(k, y, x, c);
}
}
}
}
}
}
}
in(n, hi, wi, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_nchw,
in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc,
in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 22)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
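// Hypothetical dynamic-mode invocation matching the usage above (layout 1 =
// NHWC, algo 1 = V4R1R2XDLNHWC, mirroring the static-mode default problem):
//   ./conv_bwd_driver_offline 1 1 1 4 0 10 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1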
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
const index_t N = std::stoi(argv[7]);
const index_t K = std::stoi(argv[8]);
const index_t C = std::stoi(argv[9]);
const index_t Y = std::stoi(argv[10]);
const index_t X = std::stoi(argv[11]);
const index_t Hi = std::stoi(argv[12]);
const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
// static mode
if(argc < 7)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
constexpr auto N = Number<128>{};
constexpr auto C = Number<192>{};
constexpr auto Hi = Number<71>{};
constexpr auto Wi = Number<71>{};
constexpr auto K = Number<256>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto conv_stride_h = I2;
constexpr auto conv_stride_w = I2;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
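// With this static configuration (Hi = Wi = 71, 3x3 filter, stride 2,
// dilation 1, pad 1 on all sides): YEff = 3, Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36,
// and likewise Wo = 36.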
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
if(layout == ConvTensorLayout::NCHW)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
}
else if(layout == ConvTensorLayout::NHWC)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
}
else
{
throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in_host(in_lengths_host);
Tensor<in_data_t> in_device(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in_host.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 2:
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 3:
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 4:
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 5:
out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
break;
default:
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nhwc = [&]() {
#if USE_DYNAMIC_MODE
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
#else
const auto in_lengths_dev =
make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
const auto out_lengths_dev =
make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
const auto conv_dilations_dev =
make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
const auto in_right_pads_dev =
make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
#endif
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_BWD_V4R1_XDL_NHWC
if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in_device,
wei,
out,
nrepeat);
}
#endif
#if USE_CONV_BWD_V4R1R2_XDL_NHWC
if(algo == ConvBackwardDataAlgo::V4R1R2XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
if(Y == 1 && X == 1 && in_left_pad_h == 0 && in_left_pad_w == 0 && in_right_pad_h == 0 &&
in_right_pad_w == 0)
{
device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1<
in_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in_device,
wei,
out,
nrepeat);
}
else
{
#if 1
device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in_device,
wei,
out,
nrepeat);
#endif
}
}
#endif
if(do_verification)
{
host_convolution_backward_data(in_host,
wei,
out,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
ck::utils::check_err(in_device.mData, in_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "in_host : ", in_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "in_device: ", in_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <thread>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V4R4R2NHWC, // 1
V6R1NCHW, // 2
V4R4R2XDLNCHW, // 3
V4R4R4XDLNHWC // 4
};
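// Host reference for forward convolution in either NCHW or NHWC layout.
// Products are formed in float for bhalf_t inputs (via ck::type_convert) and in
// double otherwise; the accumulator is double and is converted back to TOut on
// the final store.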
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_convolution_forward(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
if constexpr(is_same<TIn, bhalf_t>::value)
{
v += ck::type_convert<float>(in(n, c, hi, wi)) *
ck::type_convert<float>(wei(k, c, y, x));
}
else
{
v += static_cast<double>(in(n, c, hi, wi)) *
static_cast<double>(wei(k, c, y, x));
}
}
}
}
}
if constexpr(is_same<TOut, bhalf_t>::value)
{
out(n, k, ho, wo) = ck::type_convert<bhalf_t>(static_cast<float>(v));
}
else
{
out(n, k, ho, wo) = v;
}
};
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
if constexpr(is_same<TIn, bhalf_t>::value)
{
v += ck::type_convert<float>(in(n, hi, wi, c)) *
ck::type_convert<float>(wei(k, y, x, c));
}
else
{
v += static_cast<double>(in(n, hi, wi, c)) *
static_cast<double>(wei(k, y, x, c));
}
}
}
}
}
if constexpr(is_same<TOut, bhalf_t>::value)
{
out(n, ho, wo, k) = ck::type_convert<bhalf_t>(static_cast<float>(v));
}
else
{
out(n, ho, wo, k) = v;
}
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 22)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
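// Hypothetical dynamic-mode invocation matching the usage above (layout 1 =
// NHWC, algo 4 = V4R4R4XDLNHWC, the variant enabled by the defines above):
//   ./conv_fwd_driver_offline 1 4 1 4 0 10 128 256 192 3 3 71 71 1 1 1 1 1 1 1 1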
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
const index_t N = std::stoi(argv[7]);
const index_t K = std::stoi(argv[8]);
const index_t C = std::stoi(argv[9]);
const index_t Y = std::stoi(argv[10]);
const index_t X = std::stoi(argv[11]);
const index_t Hi = std::stoi(argv[12]);
const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
// static mode
if(argc < 7)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
constexpr auto N = Number<128>{};
constexpr auto C = Number<192>{};
constexpr auto Hi = Number<71>{};
constexpr auto Wi = Number<71>{};
constexpr auto K = Number<256>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 1
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 0
using in_data_t = bhalf_t;
using acc_data_t = float;
using out_data_t = bhalf_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
if(layout == ConvTensorLayout::NCHW)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
}
else if(layout == ConvTensorLayout::NHWC)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
}
else
{
throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4R2_NHWC
if(algo == ConvForwardAlgo::V4R4R2NHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4R2_XDL_NCHW
if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4R4_XDL_NHWC
if(algo == ConvForwardAlgo::V4R4R4XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
if(do_verification)
{
host_convolution_forward(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <thread>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
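// Host reference for the v5r1 NC0HWC1 forward kernel: direct convolution over
// the split channel dimensions (C0 outer, C1 inner; likewise K0/K1), plus
// per-channel bias and activation applied to the accumulated value.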
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
out(n, k0, ho, wo, k1) = activ(v, activ_type);
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
// constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<1>{};
constexpr auto K1 = Number<4>{};
#elif 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
#if 1
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
#else
constexpr auto in_left_pad_h = I0;
constexpr auto in_left_pad_w = I0;
constexpr auto in_right_pad_h = I0;
constexpr auto in_right_pad_w = I0;
#endif
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
acc_data_t,
out_data_t,
activ_type>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
bias,
out_device,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution_nchwc(in,
wei,
bias,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <algorithm>
#include <string>
#include <thread>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
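// Host reference for the fused conv+maxpool kernel: a first pass computes the
// direct convolution with bias and activation into out_host, and a second pass
// reduces each non-overlapping 2x2 window with std::max into max_host (a
// stride-2 max pool, so max_host has half the spatial extent of out_host).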
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out_host,
Tensor<TOut>& max_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
out_host(n, k0, ho, wo, k1) = v;
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
auto hx = ho * 2;
auto wx = wo * 2;
auto v0 = out_host(n, k0, hx, wx, k1);
auto v1 = out_host(n, k0, hx, wx + 1, k1);
auto v2 = out_host(n, k0, hx + 1, wx, k1);
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
};
make_ParallelTensorFunctor(maxpool_nchw,
max_host.mDesc.GetLengths()[0],
max_host.mDesc.GetLengths()[1],
max_host.mDesc.GetLengths()[2],
max_host.mDesc.GetLengths()[3],
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
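// the fused 2x2/stride-2 max pooling halves each spatial output length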
const index_t Ho_2 = Ho / 2;
const index_t Wo_2 = Wo / 2;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
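// hypothetical invocation (static mode): algo=0 (V5R1NCHWC), verify on,
// init_method=4, logging off, 5 timed repetitions:
//   ./conv_maxpool_fwd_driver_offline_nchwc 0 1 4 0 5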
constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<3>{};
constexpr auto C1 = Number<4>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
constexpr auto Ho_2 = Number<Ho / 2>{};
constexpr auto Wo_2 = Number<Wo / 2>{};
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
max_lengths_host(5), bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
max_lengths_host[0] = static_cast<std::size_t>(N);
max_lengths_host[1] = static_cast<std::size_t>(K0);
max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
max_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> max_device(max_lengths_host);
Tensor<out_data_t> max_host(max_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
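ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
ostream_HostTensorDescriptor(max_host.mDesc, std::cout << "max: ");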
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
max_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
in_data_t,
acc_data_t,
out_data_t,
activ_type>(tmp[I0], // in_lengths_dev
tmp[I1], // wei_lengths_dev
tmp[I2], // max_lengths_dev
tmp[I3], // out_lengths_dev
tmp[I4], // conv_strides_dev
tmp[I5], // conv_dilations_dev
tmp[I6], // in_left_pads_dev
tmp[I7], // in_right_pads_dev
in,
wei,
bias,
out_device,
max_device,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution_maxpool_nchwc(in,
wei,
bias,
out_host,
max_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
ck::utils::check_err(out_device.mData, out_host.mData);
ck::utils::check_err(max_device.mData, max_host.mData);
if(do_log)
{
// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
// std::endl;
LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
#define USE_DYNAMIC_MODE 1
#define USE_CONV_WRW_V4R4R2_XDL_NCHW 0
#define USE_CONV_WRW_V4R4R4_XDL_NHWC 0
#define USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW 0
#define USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC 0
#define USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC 1
enum ConvBackwardWeightAlgo
{
V4R4R2XDLNCHW, // 0
V4R4R4XDLNHWC, // 1
V4R4R2XDLATOMICNCHW, // 2
V4R4R4XDLATOMICNHWC, // 3
V4R4R5XDLATOMICNHWC, // 4
};
template <typename TOut,
typename TIn,
typename TWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_convolution_backward_weight(const Tensor<TOut>& out,
const Tensor<TIn>& in,
Tensor<TWei>& wei,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
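// reference weight gradient: each filter tap (k, c, y, x) accumulates
// in(n, c, hi, wi) * out(n, k, ho, wo) over the batch and the output map,
// where (hi, wi) is the input coordinate reached through stride/dilation/pad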
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<double>(in(n, c, hi, wi)) *
static_cast<double>(out(n, k, ho, wo));
}
}
}
}
wei(k, c, y, x) = v;
};
auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<double>(in(n, hi, wi, c)) *
static_cast<double>(out(n, ho, wo, k));
}
}
}
}
wei(k, y, x, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_kcyx,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_kyxc,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
printf("additional: desired_grid_size\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
const index_t N = std::stoi(argv[7]);
const index_t K = std::stoi(argv[8]);
const index_t C = std::stoi(argv[9]);
const index_t Y = std::stoi(argv[10]);
const index_t X = std::stoi(argv[11]);
const index_t Hi = std::stoi(argv[12]);
const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t desired_grid_size = std::stoi(argv[22]);
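// desired_grid_size is forwarded to the atomic kernel variants to choose the
// launch grid; a hypothetical NHWC run of the v4r4r5 atomic algorithm:
//   ./conv_wrw_driver_offline 1 4 1 4 0 5 256 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 720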
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
// static mode
if(argc < 7)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
constexpr auto N = Number<128>{};
constexpr auto C = Number<128>{};
constexpr auto Hi = Number<14>{};
constexpr auto Wi = Number<14>{};
constexpr auto K = Number<256>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 0
using in_data_t = float;
using wei_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using out_data_t = half_t;
using acc_data_t = float;
using wei_data_t = float;
#elif 1
using in_data_t = int8_t;
using out_data_t = int8_t;
using acc_data_t = int32_t;
using wei_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
if(layout == ConvTensorLayout::NCHW)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
}
else if(layout == ConvTensorLayout::NHWC)
{
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
}
else
{
throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<wei_data_t> wei_device(wei_lengths_host);
Tensor<wei_data_t> wei_host(wei_lengths_host);
Tensor<out_data_t> out(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei_host.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.1, 0.1}, num_thread);
out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{-0.1, 0.1}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
auto gen_out = [](auto... is) {
return GeneratorTensor_2<out_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
out.GenerateTensorValue(gen_out, num_thread);
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
// zero-initialize wei_device: the atomic algorithms accumulate partial weight
// gradients into it with atomic adds, so stale values would corrupt the result
wei_device.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
#if USE_CONV_WRW_V4R4R2_XDL_NCHW
if(algo == ConvBackwardWeightAlgo::V4R4R2XDLNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
wei_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei_device,
out,
nrepeat);
}
#endif
#if USE_CONV_WRW_V4R4R4_XDL_NHWC
if(algo == ConvBackwardWeightAlgo::V4R4R4XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
wei_data_t,
acc_data_t,
out_data_t>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei_device,
out,
nrepeat);
}
#endif
#if USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW
if(algo == ConvBackwardWeightAlgo::V4R4R2XDLATOMICNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw<
in_data_t,
wei_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei_device,
out,
desired_grid_size,
nrepeat);
}
#endif
#if USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC
if(algo == ConvBackwardWeightAlgo::V4R4R4XDLATOMICNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk<
in_data_t,
wei_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei_device,
out,
desired_grid_size,
nrepeat);
}
#endif
#if USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC
if(algo == ConvBackwardWeightAlgo::V4R4R5XDLATOMICNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk<
in_data_t,
wei_data_t,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei_device,
out,
desired_grid_size,
nrepeat);
}
#endif
if(do_verification)
{
host_convolution_backward_weight(out,
in,
wei_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
ck::utils::check_err(wei_device.mData, wei_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei_host : ", wei_host.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
#include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#include "device_gemm_xdlops_km_nk_mn.hpp"
#include "device_gemm_xdlops_mk_kn_nm.hpp"
#include "device_gemm_xdlops_mk_nk_nm.hpp"
#include "device_gemm_xdlops_km_kn_nm.hpp"
#include "device_gemm_xdlops_km_nk_nm.hpp"
#define USE_GEMM_XDL_MK_KN_MN 1
#define USE_GEMM_XDL_MK_NK_MN 1
#define USE_GEMM_XDL_KM_KN_MN 1
#define USE_GEMM_XDL_KM_NK_MN 1
#define USE_GEMM_XDL_MK_KN_NM 0
#define USE_GEMM_XDL_MK_NK_NM 0
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM // 7
};
enum struct GemmAlgo
{
Xdl_MK_KN_MN, // 0
Xdl_MK_NK_MN, // 1
Xdl_KM_KN_MN, // 2
Xdl_KM_NK_MN, // 3
Xdl_MK_KN_NM, // 4
Xdl_MK_NK_NM, // 5
Xdl_KM_KN_NM, // 6
Xdl_KM_NK_NM, // 7
};
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
const Tensor<BType>& b,
Tensor<CType>& c,
const GemmMatrixLayout layout)
{
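// layout names list A, B, C storage in order: e.g. MK_NK_MN means A is MxK
// row-major, B is NxK (i.e. B transposed), and C is MxN; each branch below is
// the same triple loop with the operand indexing permuted accordingly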
if(layout == GemmMatrixLayout::MK_KN_MN)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(m, k)) * static_cast<double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_MN)
{
auto f_mk_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(m, k)) * static_cast<double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_MN)
{
auto f_km_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(k, m)) * static_cast<double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_MN)
{
auto f_km_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(k, m)) * static_cast<double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_KN_NM)
{
auto f_mk_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(m, k)) * static_cast<double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_NM)
{
auto f_mk_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(m, k)) * static_cast<double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_NM)
{
auto f_km_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(k, m)) * static_cast<double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_NM)
{
auto f_km_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<double>(a(k, m)) * static_cast<double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
if(argc != 12)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: M, N, K\n");
printf("debug_driver_gemm_xdlops_v2r3::M01, debug_driver_gemm_xdlops_v2r3::N01\n");
exit(1);
}
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
const auto algo = static_cast<GemmAlgo>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]);
const index_t M = std::stoi(argv[7]);
const index_t N = std::stoi(argv[8]);
const index_t K = std::stoi(argv[9]);
debug::debug_driver_gemm_xdlops_v2r3::M01 = std::stoi(argv[10]);
debug::debug_driver_gemm_xdlops_v2r3::N01 = std::stoi(argv[11]);
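// hypothetical invocation: layout=1 (MK_NK_MN), algo=1 (Xdl_MK_NK_MN),
// verify on, init_method=4, logging off, 5 repeats, M=N=K=4096, M01=N01=1:
//   ./gemm_driver_offline 1 1 1 4 0 5 4096 4096 4096 1 1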
#if 0
using ab_data_t = float;
using acc_data_t = float;
using c_data_t = float;
#elif 1
using ab_data_t = half_t;
using acc_data_t = float;
using c_data_t = half_t;
#elif 1
using ab_data_t = int8_t;
using acc_data_t = int32_t;
using c_data_t = int8_t;
#endif
std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2);
std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2);
// A
if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::MK_NK_MN ||
layout == GemmMatrixLayout::MK_KN_NM || layout == GemmMatrixLayout::MK_NK_NM)
{
a_lengths_host[0] = static_cast<std::size_t>(M);
a_lengths_host[1] = static_cast<std::size_t>(K);
a_strides_host[0] = static_cast<std::size_t>(K);
a_strides_host[1] = static_cast<std::size_t>(1);
}
else
{
a_lengths_host[0] = static_cast<std::size_t>(K);
a_lengths_host[1] = static_cast<std::size_t>(M);
a_strides_host[0] = static_cast<std::size_t>(M);
a_strides_host[1] = static_cast<std::size_t>(1);
}
// B
if(layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN ||
layout == GemmMatrixLayout::MK_NK_NM || layout == GemmMatrixLayout::KM_NK_NM)
{
b_lengths_host[0] = static_cast<std::size_t>(N);
b_lengths_host[1] = static_cast<std::size_t>(K);
b_strides_host[0] = static_cast<std::size_t>(K);
b_strides_host[1] = static_cast<std::size_t>(1);
}
else
{
b_lengths_host[0] = static_cast<std::size_t>(K);
b_lengths_host[1] = static_cast<std::size_t>(N);
b_strides_host[0] = static_cast<std::size_t>(N);
b_strides_host[1] = static_cast<std::size_t>(1);
}
// C
if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::KM_KN_MN ||
layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN)
{
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1);
}
else
{
c_lengths_host[0] = static_cast<std::size_t>(N);
c_lengths_host[1] = static_cast<std::size_t>(M);
c_strides_host[0] = static_cast<std::size_t>(M);
c_strides_host[1] = static_cast<std::size_t>(1);
}
Tensor<ab_data_t> a(a_lengths_host, a_strides_host);
Tensor<ab_data_t> b(b_lengths_host, b_strides_host);
Tensor<c_data_t> c_host(c_lengths_host, c_strides_host);
Tensor<c_data_t> c_device(c_lengths_host, c_strides_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(a.mDesc, std::cout << "a: ");
ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");
std::size_t num_thread = 1;
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
break;
case 2:
a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
break;
case 3:
a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
break;
case 4:
a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
break;
default:
a.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{0.0, 1.0}, num_thread);
b.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{-0.5, 0.5}, num_thread);
}
#if USE_GEMM_XDL_MK_KN_MN
if(algo == GemmAlgo::Xdl_MK_KN_MN)
{
if(layout != GemmMatrixLayout::MK_KN_MN)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_NK_MN
if(algo == GemmAlgo::Xdl_MK_NK_MN)
{
if(layout != GemmMatrixLayout::MK_NK_MN)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_KN_MN
if(algo == GemmAlgo::Xdl_KM_KN_MN)
{
if(layout != GemmMatrixLayout::KM_KN_MN)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_NK_MN
if(algo == GemmAlgo::Xdl_KM_NK_MN)
{
if(layout != GemmMatrixLayout::KM_NK_MN)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_KN_NM
if(algo == GemmAlgo::Xdl_MK_KN_NM)
{
if(layout != GemmMatrixLayout::MK_KN_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_NK_NM
if(algo == GemmAlgo::Xdl_MK_NK_NM)
{
if(layout != GemmMatrixLayout::MK_NK_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_KN_NM
if(algo == GemmAlgo::Xdl_KM_KN_NM)
{
if(layout != GemmMatrixLayout::KM_KN_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_NK_NM
if(algo == GemmAlgo::Xdl_KM_NK_NM)
{
if(layout != GemmMatrixLayout::KM_NK_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
if(do_verification)
{
host_gemm(a, b, c_host, layout);
ck::utils::check_err(c_device.mData, c_host.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_device.mData, ",") << std::endl;
}
}
}
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/host
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/external/include/half
)
function(add_instance_library INSTANCE_NAME)
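# builds one OBJECT library per operation-instance directory; the objects are
# later aggregated into the monolithic device_operations static library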
message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
add_subdirectory(gemm)
add_subdirectory(gemm_bias2d)
add_subdirectory(gemm_bias_relu)
add_subdirectory(gemm_bias_relu_add)
add_subdirectory(gemm_splitk)
add_subdirectory(gemm_bilinear)
add_subdirectory(gemm_add_add_fastgelu)
add_subdirectory(gemm_reduce)
add_subdirectory(gemm_bias_add_reduce)
add_subdirectory(batched_gemm)
add_subdirectory(batched_gemm_reduce)
add_subdirectory(grouped_gemm)
add_subdirectory(contraction_scale)
add_subdirectory(contraction_bilinear)
add_subdirectory(conv1d_fwd)
add_subdirectory(conv2d_fwd)
add_subdirectory(conv3d_fwd)
add_subdirectory(conv2d_fwd_bias_relu)
add_subdirectory(conv2d_fwd_bias_relu_add)
add_subdirectory(conv2d_fwd_bias_relu_atomic_add)
add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(convnd_bwd_data)
add_subdirectory(conv2d_bwd_weight)
add_subdirectory(convnd_bwd_weight)
add_subdirectory(normalization)
add_subdirectory(elementwise)
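# aggregate every per-operation instance object library into one static library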
add_library(device_operations STATIC
$<TARGET_OBJECTS:device_gemm_instance>
$<TARGET_OBJECTS:device_gemm_bias_relu_instance>
$<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
$<TARGET_OBJECTS:device_gemm_bias2d_instance>
$<TARGET_OBJECTS:device_gemm_splitk_instance>
$<TARGET_OBJECTS:device_gemm_bilinear_instance>
$<TARGET_OBJECTS:device_gemm_add_add_fastgelu_instance>
$<TARGET_OBJECTS:device_gemm_bias_add_reduce_instance>
$<TARGET_OBJECTS:device_batched_gemm_instance>
$<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
$<TARGET_OBJECTS:device_grouped_gemm_instance>
$<TARGET_OBJECTS:device_contraction_scale_instance>
$<TARGET_OBJECTS:device_contraction_bilinear_instance>
$<TARGET_OBJECTS:device_conv1d_fwd_instance>
$<TARGET_OBJECTS:device_conv2d_fwd_instance>
$<TARGET_OBJECTS:device_conv3d_fwd_instance>
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
$<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
$<TARGET_OBJECTS:device_convnd_bwd_data_instance>
$<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
$<TARGET_OBJECTS:device_convnd_bwd_weight_instance>
$<TARGET_OBJECTS:device_reduce_instance>
$<TARGET_OBJECTS:device_normalization_instance>
$<TARGET_OBJECTS:device_elementwise_instance>
)
add_library(composablekernels::device_operations ALIAS device_operations)
set(DEV_OPS_INC_DIRS
${PROJECT_SOURCE_DIR}/include/ck/
${PROJECT_SOURCE_DIR}/library/include/ck/
${PROJECT_SOURCE_DIR}/external/include/
)
target_compile_features(device_operations PUBLIC)
set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(device_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>
)
# once new arches are enabled, make this an option on the main cmake file
# and pass down here to be exported
target_compile_options(device_operations PRIVATE
--offload-arch=gfx908
--offload-arch=gfx90a
)
rocm_install(TARGETS device_operations
EXPORT device_operationsTargets)
rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
rocm_install(EXPORT device_operationsTargets
FILE composable_kerneldevice_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple<
// clang-format off
//##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
//##################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
//##################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>,
DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>
// clang-format on
>;
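// registers one object of every tuple element type with the caller's instance
// vector (type-erased behind DeviceBatchedGemm unique_ptrs)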
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using BF16 = ck::bhalf_t;
using F32 = float;
using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using BF16 = ck::bhalf_t;
using F32 = float;
using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(
std::vector<std::unique_ptr<
DeviceBatchedGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using BF16 = ck::bhalf_t;
using F32 = float;
using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
std::vector<std::unique_ptr<
DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
@@ -48,13 +53,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
@@ -44,13 +49,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Col, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
@@ -39,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances = std::tuple<
>;
void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
std::vector<std::unique_ptr<
DeviceBatchedGemm<Row, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances{});
}
} // namespace device_batched_gemm_instance
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
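Since the eight instance files in this commit differ only in data type and layout triple, the following reference block summarizes the mapping, read directly off the signatures above; the one inference is that the gXY suffixes name the A, B, and C layouts in that order.

// Layout cheat-sheet for the batched-GEMM instance files above.
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

// gmk_gnk_gmn -> DeviceBatchedGemm<Row, Col, Row, ...>  (BF16 and F16 files)
// gkm_gkn_gmn -> DeviceBatchedGemm<Col, Row, Row, ...>  (F16 and F32 files)
// gkm_gnk_gmn -> DeviceBatchedGemm<Col, Col, Row, ...>  (F16 and F32 files)
// gmk_gkn_gmn -> DeviceBatchedGemm<Row, Row, Row, ...>  (F16 and F32 files)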