"vscode:/vscode.git/clone" did not exist on "72b768033c5a3910f619550100d248d96eb7aefa"
Unverified Commit f4980310 authored by Po Yen Chen's avatar Po Yen Chen Committed by GitHub
Browse files

Add client example of grouped conv2d forward (data type: fp16) (#488)

* Rename example folder for GroupedConvFwdMultipleD

* Unify example codes

* Change target names

* Add fp16 example for multiple d instance

* Re-format common.hpp

* Add interface 'DeviceGroupedConvFwd'

* Use simpler interface

* Move common conv params out

* Rename conv fwd client example folder

* Add missing include directive

* Update grouped conv instance implementations

* Simplify ckProfiler (grouped conv forward)

* Use GroupedConvFwd to implement client example

* Use greater group count in example

* Add custom target to group examples

* Add extra tag param to instance factory function

* Use tag to differentiate factory functions

* Add missing tag argument for factory function

* Remove inheritance relationship

* Remove no-longer used include directive

* Add license in front of file
parent 38470e04
add_executable(client_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
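For an out-of-tree build against an installed CK, the equivalent consumer CMakeLists would look roughly like the sketch below; the find_package name and component are assumptions, only the composable_kernel::device_operations target name comes from the lines above.

```cmake
# Minimal out-of-tree consumer sketch (package name and find_package usage are
# assumptions; only the composable_kernel::device_operations target appears above).
cmake_minimum_required(VERSION 3.16)
project(ck_client_example LANGUAGES CXX)

find_package(composable_kernel REQUIRED COMPONENTS device_operations)

add_executable(my_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
target_link_libraries(my_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
```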
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <algorithm>
#include <array>
#include <cstdlib>
#include <functional>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;

using InLayout  = ck::tensor_layout::convolution::GNHWC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr ck::index_t NumDimSpatial = 2;

static constexpr ck::index_t G  = 32;
static constexpr ck::index_t N  = 256;
static constexpr ck::index_t K  = 192;
static constexpr ck::index_t C  = 192;
static constexpr ck::index_t Y  = 3;
static constexpr ck::index_t X  = 3;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;

struct SimpleDeviceMem
{
@@ -47,30 +50,66 @@ struct SimpleDeviceMem
    void* p_mem_;
};

int main()
{
    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};

    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};

    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};

    std::partial_sum(rbegin(in_lengths),
                     std::prev(rend(in_lengths)),
                     std::next(rbegin(in_strides)),
                     std::multiplies<>{});
    std::partial_sum(rbegin(wei_lengths),
                     std::prev(rend(wei_lengths)),
                     std::next(rbegin(wei_strides)),
                     std::multiplies<>{});
    std::partial_sum(rbegin(out_lengths),
                     std::prev(rend(out_lengths)),
                     std::next(rbegin(out_strides)),
                     std::multiplies<>{});

    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
    std::rotate(
        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
    std::rotate(
        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));

    std::rotate(
        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
    std::rotate(
        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));

    std::rotate(
        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
    std::rotate(
        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 ck::Tuple<>,
                                                                                 OutLayout,
                                                                                 InDataType,
                                                                                 WeiDataType,
                                                                                 ck::Tuple<>,
                                                                                 OutDataType,
                                                                                 PassThrough,
                                                                                 PassThrough,
                                                                                 PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -91,13 +130,16 @@ int main(int argc, char* argv[])
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        {},
                                                        out.GetDeviceBuffer(),
                                                        in_lengths,
                                                        in_strides,
                                                        wei_lengths,
                                                        wei_strides,
                                                        {},
                                                        {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
@@ -112,10 +154,10 @@ int main(int argc, char* argv[])
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
                                    sizeof(WeiDataType) * G * K * Y * X * C +
                                    sizeof(OutDataType) * G * N * Ho * Wo * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -134,10 +176,16 @@ int main(int argc, char* argv[])
        }
        else
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
@@ -148,13 +196,16 @@ int main(int argc, char* argv[])
              << std::endl;

    auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                    wei.GetDeviceBuffer(),
                                                    {},
                                                    out.GetDeviceBuffer(),
                                                    in_lengths,
                                                    in_strides,
                                                    wei_lengths,
                                                    wei_strides,
                                                    {},
                                                    {},
                                                    out_lengths,
                                                    out_strides,
                                                    filter_strides,
                                                    filter_dilations,
                                                    input_left_pads,
@@ -172,6 +223,4 @@ int main(int argc, char* argv[])
        std::cout << "Done" << std::endl;
    }
}
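The stride setup in this example is worth a note: seeding the innermost stride with 1 and running std::partial_sum over the reversed lengths produces packed row-major strides, and the std::rotate calls then move the channel dimension ahead of the spatial ones, so a GNHWC descriptor is handed to the device op as GNCHW without moving any data. A self-contained sketch of the same trick on the input descriptor (illustration only, using the constants above):

```cpp
#include <algorithm>
#include <array>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>

int main()
{
    // lengths in GNHWC order: G=32, N=256, Hi=28, Wi=28, C=192
    std::array<long, 5> lengths{32, 256, 28, 28, 192};
    std::array<long, 5> strides{0, 0, 0, 0, 1}; // innermost dim (C) gets stride 1

    // Packed row-major strides: stride[i] = product of lengths[i+1..4],
    // computed as a running product over the reversed lengths.
    std::partial_sum(rbegin(lengths),
                     std::prev(rend(lengths)),
                     std::next(rbegin(strides)),
                     std::multiplies<>{});
    // strides == {38535168, 150528, 5376, 192, 1}  (packed GNHWC)

    // Rotate the last three entries so the descriptor reads GNCHW while the
    // strides still point into the original GNHWC memory.
    std::rotate(rbegin(lengths), std::next(rbegin(lengths)), std::next(rbegin(lengths), 3));
    std::rotate(rbegin(strides), std::next(rbegin(strides)), std::next(rbegin(strides), 3));

    for(auto l : lengths) std::cout << l << ' '; // 32 256 192 28 28  (GNCHW)
    std::cout << '\n';
    for(auto s : strides) std::cout << s << ' '; // 38535168 150528 1 5376 192
    std::cout << '\n';
}
```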
add_custom_target(example_grouped_conv_fwd_multiple_d)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
endif() # USE_BITINT_EXTENSION_INT4
add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
Command
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: time kernel (0=no, 1=yes)
# Following arguments (depending on number of spatial dims):
#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
#  G, N, K, C,
#  <filter spatial dimensions>, (i.e. Y, X for 2D)
#  <input image spatial dimensions>, (i.e. Hi, Wi for 2D)
#  <strides>, (i.e. Sy, Sx for 2D)
#  <dilations>, (i.e. Dy, Dx for 2D)
#  <left padding>, (i.e. LeftPy, LeftPx for 2D)
#  <right padding>, (i.e. RightPy, RightPx for 2D)
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
```
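Assuming the parser consumes the arguments in exactly the order listed above, the default problem from the result below can also be spelled out explicitly (a hypothetical invocation, not taken from the commit):

```bash
# Explicit form of the default 2D problem (argument order per the help text above):
# verify=1 init=1 time=1 | ndims=2 | G=1 N=128 K=256 C=192 | Y=3 X=3 | Hi=71 Wi=71
# | strides 2 2 | dilations 1 1 | left pads 1 1 | right pads 1 1
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1 2 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```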
Result (MI100)
```
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
```
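As a quick sanity check on the reported figures: with G=1, N=128, K=256, C=192, a 3x3 filter, and Ho=Wo=36, flop = 2·G·N·K·C·Y·X·Ho·Wo = 2·128·256·192·9·1296 ≈ 1.468·10^11, and dividing by the 1.55981 ms runtime (flop / 1e9 / ms, as in the example source) gives ≈ 94.09 TFlops, matching the printed result.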
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = BF16;
using WeiKernelDataType = BF16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using BiasKernelDataType = BF16;
using ResidualKernelDataType = BF16;
using OutKernelDataType = BF16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP16;
using WeiKernelDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP16;
using BiasKernelDataType = FP16;
using ResidualKernelDataType = FP16;
using OutKernelDataType = FP16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP32;
using WeiKernelDataType = FP32;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using BiasKernelDataType = FP32;
using ResidualKernelDataType = FP32;
using OutKernelDataType = FP32;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error Should compile this file with ck::int4_t support
#endif
#include "common.hpp"
// kernel data types
using InKernelDataType = I8;
using WeiKernelDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I8;
using BiasKernelDataType = I8;
using ResidualKernelDataType = I8;
using OutKernelDataType = I8;
// tensor data types
using InUserDataType = I4;
using WeiUserDataType = I4;
using OutUserDataType = I4;
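// Note: the user-facing tensors are int4, but the kernel computes in int8; the
// shared runner converts between them on the host under BUILD_INT4_EXAMPLE (see
// the *_converted tensors in run_grouped_conv_fwd_bias_relu_add_example.inc).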
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#define BUILD_INT4_EXAMPLE
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = I8;
using WeiKernelDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I8;
using BiasKernelDataType = I8;
using ResidualKernelDataType = I8;
using OutKernelDataType = I8;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// kernel data types
using InKernelDataType = FP16;
using WeiKernelDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP16;
using OutKernelDataType = FP16;
// tensor data types
using InUserDataType = InKernelDataType;
using WeiUserDataType = WeiKernelDataType;
using OutUserDataType = OutKernelDataType;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = PassThrough;
#include "run_grouped_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

template <typename BiasLay, typename ResidualLay>
struct LayoutSetting
{
    using BiasLayout     = BiasLay;
    using ResidualLayout = ResidualLay;
};

template <ck::index_t NDimSpatial>
struct LayoutSettingSelector;

template <>
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::G_K, ctl::G_NW_K>
{
};

template <>
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::G_K, ctl::G_NHW_K>
{
};

template <>
struct LayoutSettingSelector<3> final : LayoutSetting<ctl::G_K, ctl::G_NDHW_K>
{
};

template <ck::index_t NDimSpatial>
using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;

template <ck::index_t NDimSpatial>
using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;

template <ck::index_t NDimSpatial>
using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InputLayout<NDimSpatial>,
        WeightLayout<NDimSpatial>,
        ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
        OutputLayout<NDimSpatial>,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,        // ConvForwardSpecialization
        GemmSpec,        // GemmSpecialization
        1,               //
        256,             // BlockSize
        128,             // MPerBlock
        256,             // NPerBlock
        16,              // KPerBlock
        4,               // AK1
        4,               // BK1
        32,              // MPerXdl
        32,              // NPerXdl
        2,               // MXdlPerWave
        4,               // NXdlPerWave
        S<4, 64, 1>,     // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,      // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,      // ABlockTransferSrcAccessOrder
        2,               // ABlockTransferSrcVectorDim
        4,               // ABlockTransferSrcScalarPerVector
        4,               // ABlockTransferDstScalarPerVector_AK1
        1,               // ABlockLdsExtraM
        S<4, 64, 1>,     // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,      // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,      // BBlockTransferSrcAccessOrder
        2,               // BBlockTransferSrcVectorDim
        4,               // BBlockTransferSrcScalarPerVector
        4,               // BBlockTransferDstScalarPerVector_BK1
        1,               // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 16>,
        4>;

template <ck::index_t NDimSpatial>
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                         InUserDataType,
                                                                         WeiUserDataType,
                                                                         CShuffleDataType,
                                                                         InElementOp,
                                                                         WeiElementOp,
                                                                         PassThrough>;

template <ck::index_t NDimSpatial>
bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
                                        const ck::utils::conv::ConvParam& conv_param)
{
    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");

    const auto in_g_n_c_wis_desc   = make_input_descriptor(conv_param);
    const auto wei_g_k_c_xs_desc   = make_weight_descriptor(conv_param);
    const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param);
    const auto out_g_n_k_wos_desc  = make_output_descriptor(conv_param);

    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
    Tensor<OutUserDataType> residual(bias_g_n_k_wos_desc);
    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
@@ -63,7 +114,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    std::cout << "residual: " << residual.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;

    switch(config.init_method)
    {
    case 0: break;
    case 1:
@@ -83,7 +134,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    DeviceMem residual_device_buf(sizeof(OutKernelDataType) *
                                  residual.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());

#ifdef BUILD_INT4_EXAMPLE
    const Tensor<InKernelDataType> in_converted(in);
    const Tensor<WeiKernelDataType> wei_converted(wei);
    const Tensor<OutKernelDataType> bias_converted(bias);
@@ -93,12 +144,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    wei_device_buf.ToDevice(wei_converted.mData.data());
    bias_device_buf.ToDevice(bias_converted.mData.data());
    residual_device_buf.ToDevice(residual_converted.mData.data());
#else
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    bias_device_buf.ToDevice(bias.mData.data());
    residual_device_buf.ToDevice(residual.mData.data());
#endif

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
@@ -123,8 +174,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
    copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
    copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
    copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
@@ -133,7 +184,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    copy(conv_param.input_right_pads_, input_right_pads);

    // do Conv
    auto conv    = DeviceConvFwdInstance<NDimSpatial>{};
    auto invoker = conv.MakeInvoker();
    auto argument =
        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
@@ -155,9 +206,9 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                          conv_filter_dilations,
                          input_left_pads,
                          input_right_pads,
                          InElementOp{},
                          WeiElementOp{},
                          OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
@@ -166,7 +217,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
            "not support this Conv problem");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
@@ -176,20 +227,11 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;

    if(config.do_verification)
    {
        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);

        auto ref_conv    = HostConvFwdInstance<NDimSpatial>{};
        auto ref_invoker = ref_conv.MakeInvoker();

        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
@@ -198,36 +240,49 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  PassThrough{});

        ref_invoker.Run(ref_argument);

        // TODO: implement elementwise operation for host
        out_host.ForEach([&](auto&, auto idx) {
            OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx));
        });

        out_device_buf.FromDevice(out_device.mData.data());

#ifdef BUILD_INT4_EXAMPLE
        const Tensor<OutUserDataType> out_device_converted(out_device);

        return ck::utils::check_err(
            out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#else
        return ck::utils::check_err(
            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#endif
    }

    return true;
}

bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[])
{
    ExecutionConfig config;
    ck::utils::conv::ConvParam conv_param = DefaultConvParam;

    if(!parse_cmd_args(argc, argv, config, conv_param))
    {
        return false;
    }

    switch(conv_param.num_dim_spatial_)
    {
    case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param);
    case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
    case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
    }

    return false;
}
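The host-side verification above applies OutElementOp directly per output element. For reference, AddReluAdd fuses the convolution output with the two D tensors as e = relu(c + bias) + residual; a minimal host-side stand-in is sketched below (an illustration for clarity, not CK's actual implementation):

```cpp
#include <algorithm>
#include <iostream>

// Illustrative stand-in for ck::tensor_operation::element_wise::AddReluAdd:
// e = relu(c + d0) + d1 (bias add, ReLU, then residual add). Sketch only.
struct AddReluAddHost
{
    template <typename E, typename C, typename D0, typename D1>
    void operator()(E& e, const C& c, const D0& d0, const D1& d1) const
    {
        const float x = static_cast<float>(c) + static_cast<float>(d0); // add bias
        e = static_cast<E>(std::max(x, 0.f) + static_cast<float>(d1));  // relu, add residual
    }
};

int main()
{
    float e = 0.f;
    AddReluAddHost{}(e, -2.0f, 0.5f, 3.0f); // relu(-2 + 0.5) + 3 = 3
    std::cout << e << '\n';
}
```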
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

template <ck::index_t NDimSpatial>
using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InputLayout<NDimSpatial>,
        WeightLayout<NDimSpatial>,
        ck::Tuple<>,
        OutputLayout<NDimSpatial>,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,        // ConvForwardSpecialization
        GemmSpec,        // GemmSpecialization
        1,               //
        256,             // BlockSize
        128,             // MPerBlock
        256,             // NPerBlock
        16,              // KPerBlock
        4,               // AK1
        4,               // BK1
        32,              // MPerXdl
        32,              // NPerXdl
        2,               // MXdlPerWave
        4,               // NXdlPerWave
        S<4, 64, 1>,     // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,      // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,      // ABlockTransferSrcAccessOrder
        2,               // ABlockTransferSrcVectorDim
        4,               // ABlockTransferSrcScalarPerVector
        4,               // ABlockTransferDstScalarPerVector_AK1
        1,               // ABlockLdsExtraM
        S<4, 64, 1>,     // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,      // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,      // BBlockTransferSrcAccessOrder
        2,               // BBlockTransferSrcVectorDim
        4,               // BBlockTransferSrcScalarPerVector
        4,               // BBlockTransferDstScalarPerVector_BK1
        1,               // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 16>,
        4>;

template <ck::index_t NDimSpatial>
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                         InUserDataType,
                                                                         WeiUserDataType,
                                                                         CShuffleDataType,
                                                                         InElementOp,
                                                                         WeiElementOp,
                                                                         PassThrough>;

template <ck::index_t NDimSpatial>
bool run_grouped_conv_fwd(const ExecutionConfig& config,
                          const ck::utils::conv::ConvParam& conv_param)
{
    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");

    const auto in_g_n_c_wis_desc  = make_input_descriptor(conv_param);
    const auto wei_g_k_c_xs_desc  = make_weight_descriptor(conv_param);
    const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);

    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;

    switch(config.init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
        wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
    }

    DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());

#ifdef BUILD_INT4_EXAMPLE
    const Tensor<InKernelDataType> in_converted(in);
    const Tensor<WeiKernelDataType> wei_converted(wei);

    in_device_buf.ToDevice(in_converted.mData.data());
    wei_device_buf.ToDevice(wei_converted.mData.data());
#else
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
#endif

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};

    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };

    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
    copy(conv_param.input_right_pads_, input_right_pads);

    // do Conv
    auto conv     = DeviceConvFwdInstance<NDimSpatial>{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                      wei_device_buf.GetDeviceBuffer(),
                                      std::array<const void*, 0>{},
                                      out_device_buf.GetDeviceBuffer(),
                                      a_g_n_c_wis_lengths,
                                      a_g_n_c_wis_strides,
                                      b_g_k_c_xs_lengths,
                                      b_g_k_c_xs_strides,
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
                                      e_g_n_k_wos_lengths,
                                      e_g_n_k_wos_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;

    if(config.do_verification)
    {
        auto ref_conv    = HostConvFwdInstance<NDimSpatial>{};
        auto ref_invoker = ref_conv.MakeInvoker();

        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
                                                  out_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});

        ref_invoker.Run(ref_argument);

        out_device_buf.FromDevice(out_device.mData.data());

#ifdef BUILD_INT4_EXAMPLE
        const Tensor<OutUserDataType> out_device_converted(out_device);

        return ck::utils::check_err(
            out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#else
        return ck::utils::check_err(
            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
#endif
    }

    return true;
}

bool run_grouped_conv_fwd_example(int argc, char* argv[])
{
    ExecutionConfig config;
    ck::utils::conv::ConvParam conv_param = DefaultConvParam;

    if(!parse_cmd_args(argc, argv, config, conv_param))
    {
        return false;
    }

    switch(conv_param.num_dim_spatial_)
    {
    case 1: return run_grouped_conv_fwd<1>(config, conv_param);
    case 2: return run_grouped_conv_fwd<2>(config, conv_param);
    case 3: return run_grouped_conv_fwd<3>(config, conv_param);
    }

    return false;
}
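Both runners depend on ExecutionConfig, parse_cmd_args, and DefaultConvParam from the examples' shared common.hpp, which is not shown in this diff. A hypothetical minimal shape, inferred only from the fields the runners actually touch, would be:

```cpp
// Hypothetical sketch of the shared helpers (the real definitions live in the
// examples' common.hpp, which this diff does not show). Field names are taken
// from the usages above; defaults mirror the "1 1 1" command line.
struct ExecutionConfig
{
    bool do_verification = true; // arg1
    int init_method      = 1;    // arg2
    bool time_kernel     = true; // arg3
};

// Parses "verify init time [conv params...]"; returns false on bad input.
bool parse_cmd_args(int argc, char* argv[], ExecutionConfig& config,
                    ck::utils::conv::ConvParam& conv_param);
```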
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
endif() # USE_BITINT_EXTENSION_INT4
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: time kernel (0=no, 1=yes)
# Following arguments (depending on number of spatial dims):
#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
#  G, N, K, C,
#  <filter spatial dimensions>, (i.e. Y, X for 2D)
#  <input image spatial dimensions>, (i.e. Hi, Wi for 2D)
#  <strides>, (i.e. Sy, Sx for 2D)
#  <dilations>, (i.e. Dy, Dx for 2D)
#  <left padding>, (i.e. LeftPy, LeftPx for 2D)
#  <right padding>, (i.e. RightPy, RightPx for 2D)
bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
```
Result (MI100)
```
in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
A[M, K]: {165888, 1728}
B[N, K]: {256, 1728}
Ds[M, N]: {165888, 256}
Ds[M, N]: {165888, 256}
E[M, N]: {165888, 256}
launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
```
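The A/B/Ds/E lines show the implicit-GEMM view of the problem per group: M = N·Ho·Wo = 128·36·36 = 165888, GEMM-K = C·Y·X = 192·3·3 = 1728, and GEMM-N = K = 256, which is exactly what the printed {165888, 1728}, {256, 1728}, and {165888, 256} shapes encode; the two identical Ds lines correspond to the bias and residual tensors.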