Commit 30ca77a7 authored by root's avatar root
Browse files

merge from develop and revisison for pr#881

parents 9223a5e2 8f84a012
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
typename ComputeDataType,
typename DInDataType,
typename DOutDataType,
bool PropagateNan>
bool maxpool_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<InDataType, // InDataType
OutDataType, // OutDataType
IndexDataType, // IndexDataType
ComputeDataType, // ComputeDataType
ck::ReduceTensorOp::MAX,
true,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
DeviceMaxPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
// reference need Tensor with NCHW order
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
};
// in
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
// out
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
// indices
Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
// dout
Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
// din
Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) *
indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) *
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());
auto pool_fwd = DevicePoolFwdInstance{};
auto pool_fwd_invoker_ptr = pool_fwd.MakeInvokerPointer();
auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
{N, C, Hi, Wi},
window_spatial_lengths,
{N, C, Ho, Wo},
{C * Hi * Wi, 1, Wi * C, C},
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_fwd with the specified compilation parameters does "
"not support this problem");
}
float ave_time_fwd =
pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
auto pool_bwd = DeviceMaxPoolBwdInstance{};
auto pool_bwd_invoker_ptr = pool_bwd.MakeInvokerPointer();
auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
window_spatial_lengths,
window_strides,
window_dilations);
if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_bwd with the specified compilation parameters does "
"not support this problem");
}
size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(),
pool_bwd_workspace_device_buf.GetDeviceBuffer());
float ave_time_bwd =
pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
using ReferencePoolingFwdInstance =
ck::tensor_operation::host::ReferencePoolingFwd<4,
2,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ck::ReduceTensorOp::MAX,
PropagateNan,
true>;
auto ref_pooling_fwd = ReferencePoolingFwdInstance{};
auto ref_pooling_fwd_invoker = ref_pooling_fwd.MakeInvoker();
auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
using ReferencePoolingBwdInstance =
ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
PassThrough>;
auto ref_pooling_bwd = ReferencePoolingBwdInstance{};
auto ref_pooling_bwd_invoker = ref_pooling_bwd.MakeInvoker();
auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
}
return (pass);
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = ck::half_t;
using DOutDataType = ck::half_t;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = float;
using OutDataType = float;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = float;
using DOutDataType = float;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using XDataType = ck::half_t;
using YDataType = ck::half_t;
using IndexDataType = int32_t;
using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceInstance =
ck::tensor_operation::device::DevicePutElementImpl<XDataType, // XDataType
IndexDataType, // IndexDataType
YDataType, // YDataType
YElementwiseOp,
ck::InMemoryDataOperationEnum::Set,
1>;
int main()
{
bool do_verification = true;
bool time_kernel = false;
int N = 1024;
Tensor<XDataType> x(HostTensorDescriptor{N});
Tensor<IndexDataType> indices(HostTensorDescriptor{N});
Tensor<YDataType> y(HostTensorDescriptor{N});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});
for(int i = 0; i < N; ++i)
indices(i) = i;
DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());
x_device_buf.ToDevice(x.mData.data());
indices_device_buf.ToDevice(indices.mData.data());
auto put_instance = DeviceInstance{};
auto put_invoker_ptr = put_instance.MakeInvokerPointer();
auto put_argument_ptr = put_instance.MakeArgumentPointer(
static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
N,
N,
YElementwiseOp{});
if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
{
throw std::runtime_error("argument is not supported!");
}
float ave_time =
put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<YDataType> y_host(HostTensorDescriptor{N});
for(int i = 0; i < N; ++i)
{
IndexDataType idx = indices(i);
y_host(idx) = x(i);
}
y_device_buf.FromDevice(y.mData.data());
pass = ck::utils::check_err(y, y_host);
}
return (pass ? 0 : 1);
}
add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::bhalf_t;
using DInDataType = ck::bhalf_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
ck::index_t C_,
ck::index_t D,
ck::index_t H,
ck::index_t W,
TensorLayout layout)
{
using namespace ck::literals;
(void)N_;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};
template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
std::size_t C_,
std::size_t D,
std::size_t H,
std::size_t W,
TensorLayout layout)
{
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
}
};
template <typename DevicePoolBwdInstance,
typename DOutDataType,
typename DInDataType,
typename DOutLayout,
typename DInLayout>
bool pool3d_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Di,
ck::index_t Hi,
ck::index_t Wi,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> dinput_left_pads,
std::vector<ck::index_t> dinput_right_pads)
{
auto OutSpatialLength = [&](auto InSpatialLength, int index) {
ck::index_t left_pad = dinput_left_pads[index];
ck::index_t right_pad = dinput_right_pads[index];
ck::index_t window_len = window_lengths[index];
ck::index_t stride = window_strides[index];
ck::index_t dilation = window_dilations[index];
ck::index_t eff = (window_len - 1) * dilation + 1;
return (InSpatialLength + left_pad + right_pad - eff) / stride + 1;
};
ck::index_t Do = OutSpatialLength(Di, 0);
ck::index_t Ho = OutSpatialLength(Hi, 1);
ck::index_t Wo = OutSpatialLength(Wi, 2);
Tensor<DOutDataType> dout(f_host_tensor_descriptor(N, C, Do, Ho, Wo, DOutLayout{}));
Tensor<DInDataType> din_dev(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
Tensor<DInDataType> din_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
std::cout << "dout: " << dout.mDesc << std::endl;
std::cout << "din_host: " << din_host.mDesc << std::endl;
dout.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{0.0, 1.0});
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) * din_dev.mDesc.GetElementSpaceSize());
dout_device_buf.ToDevice(dout.mData.data());
din_device_buf.SetZero();
auto pool = DevicePoolBwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr =
pool.MakeArgumentPointer(static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
{N, C, Do, Ho, Wo},
{N, C, Di, Hi, Wi},
f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << std::endl;
bool pass = true;
if(do_verification)
{
auto ref_pool =
ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>();
auto ref_invoker = ref_pool.MakeInvoker();
auto ref_argument = ref_pool.MakeArgument(din_host,
dout,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
ref_invoker.Run(ref_argument);
din_device_buf.FromDevice(din_dev.mData.data());
pass = ck::utils::check_err(din_dev, din_host);
}
return pass;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using DOutDataType = float;
using DInDataType = float;
using ComputeDataType = float;
#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout = ck::tensor_layout::convolution::NCDHW;
#endif
using DevicePoolBwdInstance =
ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
DInDataType,
ComputeDataType,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
1, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
int main()
{
std::vector<ck::index_t> window_lengths = {5, 5, 5};
std::vector<ck::index_t> window_strides = {2, 2, 2};
std::vector<ck::index_t> window_dilations = {2, 2, 2};
std::vector<ck::index_t> dinput_left_pads = {0, 0, 0};
std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};
ck::index_t N = 1;
ck::index_t C = 16;
ck::index_t Di = 40;
ck::index_t Hi = 40;
ck::index_t Wi = 40;
pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
true,
false,
N,
C,
Di,
Hi,
Wi,
window_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads);
}
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(example_image_to_column)
add_example_executable(example_image_to_column_f32 image_to_column_f32.cpp)
add_dependencies(example_image_to_column example_image_to_column_f32)
set(target 1)
endif()
endforeach()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
static inline constexpr ck::index_t NDimSpatial = 2;
using FP32 = float;
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
};
#define DefaultConvParams \
ck::utils::conv::ConvParam \
{ \
NDimSpatial, 1, 32, 1, 1, {4, 4}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, { 0, 0 } \
}
inline void print_help_msg()
{
std::cerr << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
inline bool parse_cmd_args(int argc,
char* argv[],
ExecutionConfig& config,
ck::utils::conv::ConvParam& conv_params)
{
constexpr int num_execution_config_args =
3; // arguments for do_verification, init_method, time_kernel
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
constexpr int threshold_to_catch_all_args =
threshold_to_catch_partial_args + num_conv_param_leading_args;
if(argc == 1)
{
// use default
config = ExecutionConfig{};
}
// catch only ExecutionConfig arguments
else if(argc == threshold_to_catch_partial_args)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
// catch both ExecutionConfig & ConvParam arguments
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_params = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
}
else
{
print_help_msg();
return false;
}
return true;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using InDataType = FP32;
using OutDataType = FP32;
using InLayout = ck::tensor_layout::convolution::GNHWC;
// clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
< NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>;
// clang-format on
bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
{
const auto N = conv_params.N_;
const auto C = conv_params.C_;
const ck::index_t NDoHoWo =
N * ck::accumulate_n<ck::index_t>(
conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const ck::index_t CZYX =
C * ck::accumulate_n<ck::index_t>(
conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(conv_params.conv_filter_strides_, conv_filter_strides);
copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
copy(conv_params.input_left_pads_, input_left_pads);
copy(conv_params.input_right_pads_, input_right_pads);
Tensor<InDataType> in(in_desc);
Tensor<OutDataType> out_device(out_desc);
Tensor<OutDataType> out_host(out_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "out: " << out_device.mDesc << std::endl;
switch(config.init_method)
{
case 0: break;
case 1: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
// reset input to zero
out_device_buf.SetZero();
static_assert(std::is_default_constructible_v<DeviceImgToColInstance>);
// do conv
auto img2col = DeviceImgToColInstance{};
auto invoker = img2col.MakeInvoker();
auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
N,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
if(!img2col.IsSupportedArgument(argument))
{
std::cerr << "wrong! device_img2col with the specified compilation parameters does "
"not support this img2col problem"
<< std::endl;
return false;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
if(config.do_verification)
{
auto ref_image_to_column = ck::tensor_operation::host::
ReferenceImageToColumn<NDimSpatial, InLayout, InDataType, OutDataType>();
auto ref_invoker = ref_image_to_column.MakeInvoker();
auto ref_argument = ref_image_to_column.MakeArgument(in,
out_host,
conv_params.filter_spatial_lengths_,
conv_params.conv_filter_strides_,
conv_params.conv_filter_dilations_,
conv_params.input_left_pads_,
conv_params.input_right_pads_);
if(!ref_image_to_column.IsSupportedArgument(&ref_argument))
{
std::cerr << "wrong! ref_img2col with the specified compilation parameters does "
"not support this img2col problem"
<< std::endl;
return false;
}
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device.mData, out_host.mData);
}
return true;
}
int RunImageToColumnExample(int argc, char* argv[])
{
ExecutionConfig config;
ck::utils::conv::ConvParam conv_params = DefaultConvParams;
if(!parse_cmd_args(argc, argv, config, conv_params))
{
return EXIT_FAILURE;
}
if(conv_params.num_dim_spatial_ != NDimSpatial)
{
std::cerr << "unsupported # of spatial dimensions" << std::endl;
return EXIT_FAILURE;
}
return !RunImageToColumn(config, conv_params);
}
int main(int argc, char* argv[]) { return RunImageToColumnExample(argc, argv); }
add_custom_target(example_gemv_splitk)
add_example_executable(example_gemv_splitk_fp16 gemv_splitk_fp16.cpp)
add_dependencies(example_gemv_splitk
example_gemv_splitk_fp16)
\ No newline at end of file
...@@ -22,15 +22,15 @@ ...@@ -22,15 +22,15 @@
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
struct ProblemSize final struct ProblemSize final // Default GEMV problem size
{ {
ck::index_t M = 3840; ck::index_t M = 1;
ck::index_t N = 4096; ck::index_t N = 1104;
ck::index_t K = 4096; ck::index_t K = 4608;
ck::index_t stride_A = K;
ck::index_t StrideA = 4096; ck::index_t stride_B = K;
ck::index_t StrideB = 4096; ck::index_t stride_C = N;
ck::index_t StrideC = 4096; ck::index_t k_batch = 1;
}; };
struct ExecutionConfig final struct ExecutionConfig final
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_splitK_gemv.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp"
using ADataType = ck::half_t; using ADataType = ck::half_t;
using BDataType = ck::half_t; using BDataType = ck::half_t;
...@@ -17,15 +17,14 @@ using AElementOp = PassThrough; ...@@ -17,15 +17,14 @@ using AElementOp = PassThrough;
using BElementOp = PassThrough; using BElementOp = PassThrough;
using CElementOp = PassThrough; using CElementOp = PassThrough;
// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
#define K1 8 //2,4,8 #define K1 8 //K1PerThread:2,4,8
#define K0 4 //1,2,3,4...32 --> 423ms vs 282ms (k0=4) #define K0 4 //K0PerBlock:1,2,3,4...32
#define N1 2 //2,4,8 #define N1 2 //Nperthread:2,4,8
#define B 64 //64 #define B 64 //block-size:64
// clang-format off // clang-format off
using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/* using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/*
...@@ -34,13 +33,12 @@ using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/* ...@@ -34,13 +33,12 @@ using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/*
// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | | // ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, 64, 1, 64, 32, 2, 1, 1, 1, S<1, 1, 1, 2>, S<32, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 1>;*/ //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, 64, 1, 64, 32, 2, 1, 1, 1, S<1, 1, 1, 2>, S<32, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 1>;*/
// < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, 64, 1, 128, 8, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,8, 1, 1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>;
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, 1, B*N1, K0, K1, 1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1, 1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, K1, S<0, 1, 2, 3, 4, 5>, 5, N1>; < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, 1, B*N1, K0, K1, 1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1, 1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, K1, S<0, 1, 2, 3, 4, 5>, 5, N1>;
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_splitK_gemv_example.inc" #include "run_gemv_splitk_example.inc"
int main(int argc, char* argv[]) { return !run_gemv_example(argc, argv); } int main(int argc, char* argv[]) { return !run_gemv_example(argc, argv); }
...@@ -2,29 +2,8 @@ ...@@ -2,29 +2,8 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
struct ProblemSize_gemv final
{
// ck::index_t M = 1; // 3840;
// ck::index_t N = 128;
// ck::index_t K = 128;
// ck::index_t stride_A = K;
// ck::index_t stride_B = K;
// ck::index_t stride_C = N;
// ck::index_t k_batch = 1;
ck::index_t M = 1; // 3840;
ck::index_t N = 1104;
ck::index_t K = 4608;
ck::index_t stride_A = K;
ck::index_t stride_B = K;
ck::index_t stride_C = N;
ck::index_t k_batch = 1; bool run_gemv(const ProblemSize& problem_size, const ExecutionConfig& config)
};
bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& config)
{ {
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) #if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
...@@ -116,7 +95,7 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi ...@@ -116,7 +95,7 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi
c_element_op, c_element_op,
k_batch); // // k_batch); // //
// // // //
if(!gemv.IsSupportedArgument(argument)) if(!gemv.IsSupportedArgument(argument))
{ {
std::cerr << gemv.GetTypeString() << " does not support this problem" << std::endl; std::cerr << gemv.GetTypeString() << " does not support this problem" << std::endl;
...@@ -124,18 +103,9 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi ...@@ -124,18 +103,9 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi
return true; return true;
} }
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); c_m_n_device_buf.Zero();
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time; invoker.Run(argument, StreamConfig{nullptr, false}); // Run prior to verification
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemv.GetTypeString() << std::endl;
if(config.do_verification) if(config.do_verification)
{ {
...@@ -162,12 +132,26 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi ...@@ -162,12 +132,26 @@ bool run_gemv(const ProblemSize_gemv& problem_size, const ExecutionConfig& confi
#endif #endif
} }
float ave_time = invoker.Run(
argument, StreamConfig{nullptr, config.time_kernel}); // Run to measure performance
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemv.GetTypeString() << std::endl;
return true; return true;
} }
bool run_gemv_example(int argc, char* argv[]) bool run_gemv_example(int argc, char* argv[])
{ {
ProblemSize_gemv problem_size; ProblemSize problem_size;
// problem_size.M = 1; // problem_size.M = 1;
ExecutionConfig config; ExecutionConfig config;
if(argc == 1) if(argc == 1)
......
File mode changed from 100644 to 100755
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#pragma once #pragma once
#include "ck/config.h"
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "hip/hip_fp16.h" #include "hip/hip_fp16.h"
...@@ -27,6 +29,21 @@ ...@@ -27,6 +29,21 @@
#define CK_WAVELET_MIN_BLOCK_PER_CU 2 #define CK_WAVELET_MIN_BLOCK_PER_CU 2
#endif #endif
// kernel attribute: amdgpu_waves_per_eu()
#ifdef CK_USE_WAVES_PER_EU
// for 1-wave kernels, control arguments of amdgpu_waves_per_eu() attribute
#ifndef CK_MIN_WAVES_PER_EU
#define CK_MIN_WAVES_PER_EU 0
#endif
#ifndef CK_MAX_WAVES_PER_EU
#define CK_MAX_WAVES_PER_EU 0
#endif
#else
#define CK_USE_WAVES_PER_EU 0
#endif
// buffer resource // buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code #ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
...@@ -103,8 +120,15 @@ ...@@ -103,8 +120,15 @@
// inline asm // inline asm
#define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INLINE_ASM 1
// inner product (DLOP) // inner product (V_MAC/V_FMAC)
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 #define CK_USE_AMD_V_MAC_INLINE_ASM 1
// V_DOT inline instructions, less efficient since they require adding
// `s_nop`s to avoid hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
...@@ -148,6 +172,10 @@ ...@@ -148,6 +172,10 @@
#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1 #define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
// experimental feature: add instances using pipeline v2 // experimental feature: add instances using pipeline v2
#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1 #define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
// experimental feature: optimize pipeline v2 by IGLP strategy (value=ID of strategy)
#ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0
#endif
// hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
...@@ -173,6 +201,7 @@ ...@@ -173,6 +201,7 @@
// workaround: compiler issue on gfx908 // workaround: compiler issue on gfx908
#define CK_WORKAROUND_SWDEV_388832 1 #define CK_WORKAROUND_SWDEV_388832 1
// flag to enable (1) or disable (0) the debugging output in some kernels // flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0 #define DEBUG_LOG 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment