Commit b89a88b5 authored by Adam Osewski's avatar Adam Osewski
Browse files

Merge branch 'develop' into wavelet_model

parents 41d5fca7 43c898f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <numeric>
#include <type_traits>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
using FP32 = float;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using I4 = ck::int4_t;
#endif
using I8 = std::int8_t;
using I32 = std::int32_t;
template <typename ALay, typename BLay, typename DELay, typename RLay>
struct LayoutSetting
{
using ALayout = ALay;
using BLayout = BLay;
using DELayout = DELay;
using RLayout = RLay;
};
template <ck::index_t NDimSpatial>
struct LayoutSettingSelector;
namespace ctl = ck::tensor_layout::convolution;
template <>
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::GNWC, ctl::GKXC, ctl::GNWK, ctl::GNW>
{
};
template <>
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::GNHWC, ctl::GKYXC, ctl::GNHWK, ctl::GNHW>
{
};
template <>
struct LayoutSettingSelector<3> final
: LayoutSetting<ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK, ctl::GNDHW>
{
};
template <ck::index_t NDimSpatial>
using ALayout = typename LayoutSettingSelector<NDimSpatial>::ALayout;
template <ck::index_t NDimSpatial>
using BLayout = typename LayoutSettingSelector<NDimSpatial>::BLayout;
template <ck::index_t NDimSpatial>
using DELayout = typename LayoutSettingSelector<NDimSpatial>::DELayout;
template <ck::index_t NDimSpatial>
using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
inline void print_help_msg()
{
std::cerr << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
inline bool parse_cmd_args(int argc,
char* argv[],
ck::utils::conv::ConvParam& problem_size,
ExecutionConfig& config)
{
constexpr int num_execution_config_args =
3; // arguments for do_verification, init_method, time_kernel
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
constexpr int threshold_to_catch_all_args =
threshold_to_catch_partial_args + num_conv_param_leading_args;
if(argc == 1)
{
// use default
}
// catch only ExecutionConfig arguments
else if(argc == threshold_to_catch_partial_args)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
// catch both ExecutionConfig & ConvParam arguments
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
problem_size = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
}
else
{
print_help_msg();
return false;
}
return true;
}
inline HostTensorDescriptor
make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
std::copy(begin(problem_size.output_spatial_lengths_),
end(problem_size.output_spatial_lengths_),
std::back_inserter(dimensions));
return HostTensorDescriptor(dimensions);
}
template <typename Lengths, typename Strides>
void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
Lengths& lengths,
Strides& strides)
{
assert(size(descriptor.GetLengths()) == size(lengths));
std::copy_n(begin(descriptor.GetLengths()), size(descriptor.GetLengths()), begin(lengths));
assert(size(descriptor.GetStrides()) == size(strides));
std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
}
template <typename Range, typename OutputIterator>
auto copy(const Range& range, OutputIterator iter)
-> decltype(std::copy(std::begin(range), std::end(range), iter))
{
return std::copy(std::begin(range), std::end(range), iter);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = FP16;
using BDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = FP16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = FP32;
using BDataType = FP32;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = FP32;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error Should compile this file with ck::int4_t support
#endif
#define BUILD_INT4_EXAMPLE
#include "common.hpp"
using ADataType = I4;
using BDataType = I4;
using KernelADataType = I8;
using KernelBDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
template <ck::index_t NDimSpatial>
using DeviceInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
//######| NDimSpatial| ALayout| BLayout| DELayout| RLayout| AData| BData| AccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| Conv| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Fwd|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| Specialization| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
#ifdef BUILD_INT4_EXAMPLE
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#else
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#endif
template <ck::index_t NDimSpatial>
using HostInstance = ck::tensor_operation::host::ReferenceConvFwd
<NDimSpatial, ADataType, BDataType, EDataType, AElementOp, BElementOp, PassThrough>;
// clang-format on
template <ck::index_t NDimSpatial>
bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
const ExecutionConfig& config)
{
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
const auto conv_input_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ALayout<NDimSpatial>>(
problem_size);
const auto conv_weight_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<BLayout<NDimSpatial>>(
problem_size);
const auto conv_output_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<DELayout<NDimSpatial>>(
problem_size);
const auto r0_desc = make_r0_host_tensor_descriptor(problem_size);
Tensor<ADataType> conv_input(conv_input_g_n_c_wis_desc);
Tensor<BDataType> conv_weight(conv_weight_g_k_c_xs_desc);
Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
Tensor<R0DataType> r0_device(r0_desc);
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
conv_input.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
conv_weight.end());
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
conv_weight.end());
}
DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
DeviceMem conv_output_device_buf(sizeof(EDataType) *
conv_output_device.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> conv_input_converted(conv_input);
const Tensor<KernelBDataType> conv_weight_converted(conv_weight);
conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
#else
conv_input_device_buf.ToDevice(conv_input.mData.data());
conv_weight_device_buf.ToDevice(conv_weight.mData.data());
#endif
std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
conv_input_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> conv_weight_g_k_c_xs_lengths{},
conv_weight_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
conv_output_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};
unpack_host_tensor_descriptor(
conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
unpack_host_tensor_descriptor(
conv_weight_g_k_c_xs_desc, conv_weight_g_k_c_xs_lengths, conv_weight_g_k_c_xs_strides);
unpack_host_tensor_descriptor(
conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
copy(problem_size.input_left_pads_, begin(input_left_pads));
copy(problem_size.input_right_pads_, begin(input_right_pads));
// run Conv + Reduction on device
auto conv = DeviceInstance<NDimSpatial>{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
conv_weight_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
conv_output_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer()},
conv_input_g_n_c_wis_lengths,
conv_input_g_n_c_wis_strides,
conv_weight_g_k_c_xs_lengths,
conv_weight_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
conv_output_g_n_k_wos_lengths,
conv_output_g_n_k_wos_strides,
r0_lengths,
r0_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
AElementOp{},
BElementOp{},
CDEElementOp{},
QsElementOp{},
RsElementOp{});
if(!conv.IsSupportedArgument(argument))
{
std::cerr << "wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
<< std::endl;
return false;
}
const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
const std::size_t flop = problem_size.GetFlops();
const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
const float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
const float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(config.do_verification)
{
Tensor<EDataType> conv_output_host(conv_output_g_n_k_wos_desc);
// run Conv + Reduction on host
auto ref_conv = HostInstance<NDimSpatial>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(conv_input,
conv_weight,
conv_output_host,
problem_size.conv_filter_strides_,
problem_size.conv_filter_dilations_,
problem_size.input_left_pads_,
problem_size.input_right_pads_,
AElementOp{},
BElementOp{},
PassThrough{});
ref_invoker.Run(ref_argument);
Tensor<R0DataType> r0_host(r0_device.mDesc);
auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];
auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();
if constexpr(NDimSpatial == 1)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t w = 0; w < output_dims[3]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val =
ck::type_convert<ReduceAccDataType>(conv_output_host(g, n, k, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
else if constexpr(NDimSpatial == 2)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t h = 0; h < output_dims[3]; ++h)
{
for(std::size_t w = 0; w < output_dims[4]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val = ck::type_convert<ReduceAccDataType>(
conv_output_host(g, n, k, h, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
}
else if constexpr(NDimSpatial == 3)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t d = 0; d < output_dims[3]; ++d)
{
for(std::size_t h = 0; h < output_dims[4]; ++h)
{
for(std::size_t w = 0; w < output_dims[5]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val = ck::type_convert<ReduceAccDataType>(
conv_output_host(g, n, k, d, h, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, d, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
}
}
conv_output_device_buf.FromDevice(conv_output_device.mData.data());
r0_device_buf.FromDevice(r0_device.mData.data());
return ck::utils::check_err(conv_output_device.mData,
conv_output_host.mData,
"Error: incorrect results! (Matrix E)",
1e-5f,
1e-4f) &&
ck::utils::check_err(r0_device.mData,
r0_host.mData,
"Error: incorrect results! (Matrix R0)",
1e-5f,
1e-4f);
}
return true;
}
bool run_convnd_fwd_max_example(int argc, char* argv[])
{
ck::utils::conv::ConvParam problem_size{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ExecutionConfig config;
if(!parse_cmd_args(argc, argv, problem_size, config))
{
return false;
}
switch(problem_size.num_dim_spatial_)
{
case 1: return run_convnd_fwd_max<1>(problem_size, config);
case 2: return run_convnd_fwd_max<2>(problem_size, config);
case 3: return run_convnd_fwd_max<3>(problem_size, config);
}
return false;
}
......@@ -225,6 +225,28 @@ int main(int argc, char* argv[])
arg.scales[0],
arg.scales[1]);
}
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
else if(arg.data_type == 7)
{
pass = reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
#endif
}
else
{
......@@ -251,6 +273,15 @@ int main(int argc, char* argv[])
pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// for testing int4_t using AVG operation
pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing int4_t using MAX operation
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#endif
// for testing 3D input
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
......
......@@ -58,28 +58,47 @@ int reduce_blockwise_impl(bool do_verification,
std::is_same<InOutDataType, float>::value &&
(op_support_indices && !std::is_same<AccDataType, float>::value);
// 1) If InOutDataType is int8_t, must use int8_t as AccDataType for indexable reduction
// operations 2) If InOutDataType is int8_t, must use int32_t as AccDataType for non-indexable
// reduction operations
// 1) If InOutDataType is int8_t or int4_t, must use int8_t as AccDataType for indexable
// reduction operations 2) If InOutDataType is int8_t or int4_t, must use int32_t as AccDataType
// for non-indexable reduction operations
constexpr bool invalid_reduce_4 =
std::is_same<InOutDataType, int8_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
(op_support_indices && !std::is_same<AccDataType, int8_t>::value));
// 1) If InOutDataType is int8_t, the supported operation must be either indexable operations or
// ADD/AVG
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce_4_2 =
std::is_same<InOutDataType, int4_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
(op_support_indices && !std::is_same<AccDataType, int8_t>::value));
#endif
// 1) If InOutDataType is int8_t or int4_t, the supported operation must be either indexable
// operations or ADD/AVG
constexpr bool invalid_reduce_5 = std::is_same<InOutDataType, int8_t>::value &&
(!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
ReduceOpId != ReduceTensorOp::AVG);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce_5_2 = std::is_same<InOutDataType, int4_t>::value &&
(!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
ReduceOpId != ReduceTensorOp::AVG);
#endif
// 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations
constexpr bool invalid_reduce_6 =
std::is_same<InOutDataType, bhalf_t>::value && !std::is_same<AccDataType, float>::value;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce =
(invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 ||
invalid_reduce_5 || invalid_reduce_6 || invalid_reduce_4_2 || invalid_reduce_5_2);
#else
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
#endif
if(invalid_reduce)
if constexpr(invalid_reduce)
{
std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
return (-1);
......@@ -91,10 +110,17 @@ int reduce_blockwise_impl(bool do_verification,
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using InOutDataTypeInDevice = typename std::
conditional<std::is_same<InOutDataType, int4_t>::value, int8_t, InOutDataType>::type;
#else
using InOutDataTypeInDevice = InOutDataType;
#endif
using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataType,
ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataTypeInDevice,
AccDataType,
InOutDataType,
InOutDataTypeInDevice,
Rank,
NumReduceDim,
ReduceOperation,
......@@ -166,13 +192,35 @@ int reduce_blockwise_impl(bool do_verification,
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data());
in_dev.ToDevice(tmp_buf.data());
}
else
#endif
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data());
out_dev.ToDevice(tmp_buf.data());
}
else
#endif
out_dev.ToDevice(out.mData.data());
};
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
......@@ -261,7 +309,19 @@ int reduce_blockwise_impl(bool do_verification,
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
out_dev.FromDevice(tmp_buf.data());
std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data());
}
else
#endif
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
if(OutputIndex)
......
add_custom_target(example_grouped_gemm_xdl)
add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
add_dependencies(example_grouped_gemm_xdl
example_grouped_gemm_xdl_fp32
example_grouped_gemm_xdl_fp16
example_grouped_gemm_xdl_bfp16
example_grouped_gemm_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = BF16;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
#include "run_grouped_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F32;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment