Unverified Commit 984889fb authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Merge pull request #13 from ROCmSoftwarePlatform/merge_from-_public_repo

Merge from  public repo
parents a73ab0d8 4a106f7d
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <vector> #include <vector>
...@@ -172,18 +172,19 @@ int main() ...@@ -172,18 +172,19 @@ int main()
BLayout, BLayout,
CLayout>(); CLayout>();
const auto normalize_ptrs =
ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
CDataType,
ReduceDataType,
ReduceDataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>();
std::cout << "found " << gemm_reduce_ptrs.size() std::cout << "found " << gemm_reduce_ptrs.size()
<< " gemm_reduceMean_reduceSquareMean instances" << std::endl; << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
ck::Tuple<LayerNormOutDataType>,
ck::tensor_operation::element_wise::Normalize,
2>;
const auto normalize_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
NormalizeDeviceOp>::GetInstances();
std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl; std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
auto f_matrix_space_size = auto f_matrix_space_size =
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <numeric> #include <numeric>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <numeric> #include <numeric>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <numeric> #include <numeric>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <numeric> #include <numeric>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <numeric> #include <numeric>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <vector> #include <vector>
...@@ -12,12 +12,14 @@ ...@@ -12,12 +12,14 @@
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
using XDataType = ck::half_t; using XDataType = ck::half_t;
using GammaDataType = ck::half_t; using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t; using BetaDataType = ck::half_t;
using YDataType = ck::half_t; using YDataType = ck::half_t;
using ComputeDataType = float; using SaveMeanInvStdDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 2; constexpr int Rank = 2;
constexpr int NumReduceDim = 1; constexpr int NumReduceDim = 1;
...@@ -50,12 +52,16 @@ int main(int argc, char* argv[]) ...@@ -50,12 +52,16 @@ int main(int argc, char* argv[])
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
#ifdef SAVE_MEAN_INV_STD
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
#endif
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
ComputeDataType,
YDataType, YDataType,
SaveMeanInvStdDataType,
PassThrough, PassThrough,
Rank, Rank,
NumReduceDim>; NumReduceDim>;
...@@ -84,14 +90,21 @@ int main(int argc, char* argv[]) ...@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
{0, 1}, // gammaStrides {0, 1}, // gammaStrides
{0, 1}, // betaStrides {0, 1}, // betaStrides
{Stride, 1}, // yStrides {Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims {1}, // reduceDims
1e-4, 1e-4,
x_device_buf.GetDeviceBuffer(), x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(), gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(), beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(), y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr, nullptr,
nullptr, nullptr,
#endif
PassThrough{}); PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
...@@ -100,11 +113,19 @@ int main(int argc, char* argv[]) ...@@ -100,11 +113,19 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
#ifdef SAVE_MEAN_INV_STD
num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
#endif
float gb_per_sec = num_byte / 1.E6 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
...@@ -136,23 +157,34 @@ int main(int argc, char* argv[]) ...@@ -136,23 +157,34 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
{Stride, 1}, // xStrides {Stride, 1}, // xStrides
{1}, // gammaStrides {0, 1}, // gammaStrides
{1}, // betaStrides {0, 1}, // betaStrides
{Stride, 1}, // yStrides {Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims {1}, // reduceDims
1e-4, 1e-4,
x_device_buf.GetDeviceBuffer(), x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(), gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(), beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(), y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr, nullptr,
nullptr, nullptr,
#endif
PassThrough{}); PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <functional> #include <functional>
#include <numeric> #include <numeric>
...@@ -53,12 +53,35 @@ int main(int argc, char* argv[]) ...@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
SimpleDeviceMem in(sizeof(InDataType) * num_elements); SimpleDeviceMem in(sizeof(InDataType) * num_elements);
SimpleDeviceMem out(sizeof(OutDataType) * num_elements); SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
using DeviceOp = ck::tensor_operation::device:: using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>; AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
auto& generic_op_ptr = op_ptrs[0];
auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,
alpha,
beta,
in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
PassThrough{},
PassThrough{});
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
{
throw std::runtime_error(
"The generic kernel instance should be able to support any input shapes");
};
std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name; std::string best_op_name;
...@@ -74,11 +97,6 @@ int main(int argc, char* argv[]) ...@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
{ {
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
{
continue;
}
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides, in_strides,
reduce_dims, reduce_dims,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib> #include <cstdlib>
#include <iomanip> #include <iomanip>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib> #include <cstdlib>
#include <iomanip> #include <iomanip>
...@@ -17,22 +17,22 @@ using InDataType = ck::half_t; ...@@ -17,22 +17,22 @@ using InDataType = ck::half_t;
using WeiDataType = ck::half_t; using WeiDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 32; static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 256; static constexpr ck::index_t N = 256; // batch size
static constexpr ck::index_t K = 192; static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 192; static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -52,50 +52,24 @@ struct SimpleDeviceMem ...@@ -52,50 +52,24 @@ struct SimpleDeviceMem
int main() int main()
{ {
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C}; // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1}; // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K}; std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::partial_sum(rbegin(in_lengths),
std::prev(rend(in_lengths)),
std::next(rbegin(in_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(wei_lengths),
std::prev(rend(wei_lengths)),
std::next(rbegin(wei_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(out_lengths),
std::prev(rend(out_lengths)),
std::next(rbegin(out_strides)),
std::multiplies<>{});
// transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
std::rotate(
rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
std::rotate(
rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
std::rotate(
rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
std::rotate(
rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
std::rotate(
rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
std::rotate(
rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
...@@ -155,9 +129,9 @@ int main() ...@@ -155,9 +129,9 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(WeiDataType) * G * K * Y * X * C + sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * G * N * Ho * Wo * K; sizeof(OutDataType) * N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp) add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
...@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable ...@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable
add_executable(client_gemm_quantization gemm_quantization.cpp) add_executable(client_gemm_quantization gemm_quantization.cpp)
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
endif()
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -17,26 +17,26 @@ using BiasDataType = int32_t; ...@@ -17,26 +17,26 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu; using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
SimpleDeviceMem() = delete; SimpleDeviceMem() = delete;
...@@ -55,8 +55,11 @@ struct SimpleDeviceMem ...@@ -55,8 +55,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
...@@ -64,17 +67,18 @@ int main(int argc, char* argv[]) ...@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial, NumDimSpatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -16,19 +16,19 @@ using WeiDataType = int8_t; ...@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t; using BiasDataType = int32_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu; using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -55,23 +55,27 @@ struct SimpleDeviceMem ...@@ -55,23 +55,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -17,21 +17,21 @@ using BiasDataType = int32_t; ...@@ -17,21 +17,21 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH; using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = using OutElementOp =
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>; ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -58,8 +58,11 @@ struct SimpleDeviceMem ...@@ -58,8 +58,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
...@@ -67,17 +70,18 @@ int main(int argc, char* argv[]) ...@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial, NumDimSpatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -16,19 +16,19 @@ using WeiDataType = int8_t; ...@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t; using BiasDataType = int32_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH; using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -56,23 +56,27 @@ struct SimpleDeviceMem ...@@ -56,23 +56,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -16,25 +16,25 @@ using WeiDataType = int8_t; ...@@ -16,25 +16,25 @@ using WeiDataType = int8_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -54,23 +54,27 @@ struct SimpleDeviceMem ...@@ -54,23 +54,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
...@@ -15,18 +15,18 @@ using InDataType = int8_t; ...@@ -15,18 +15,18 @@ using InDataType = int8_t;
using WeiDataType = int8_t; using WeiDataType = int8_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -53,20 +53,24 @@ struct SimpleDeviceMem ...@@ -53,20 +53,24 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment