Commit 9b0ee1f1 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into stream-k-initial-impl

parents c3c89727 3b18f1e3
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
 - Added multi-embeddings support (#542).
 - Added Navi3x blockwise GEMM and real GEMM support (#541).
 - Added Navi grouped ConvBwdWeight support (#505).
+- Added pool3d forward (#697).
+- Added maxpool backward (#750).
 ### Changed
 - Changed ...
@@ -696,7 +696,7 @@ pipeline {
         agent{ label rocmnode("gfx908 || gfx90a") }
         environment{
             setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
-            execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+            execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
         }
         steps{
             Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
...
@@ -14,10 +14,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
     endif()
 endforeach()
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
     add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-        set(target 1)
-    endif()
-endforeach()
+endif()
\ No newline at end of file
@@ -13,10 +13,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
     endif()
 endforeach()
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
     add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
-        set(target 1)
-    endif()
-endforeach()
+endif()
add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = ck::bhalf_t;
using DOutDataType = ck::bhalf_t;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template <typename InDataType,
typename OutDataType,
typename IndexDataType,
typename ComputeDataType,
typename DInDataType,
typename DOutDataType,
bool PropagateNan>
bool maxpool_bwd_test(bool do_verification,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
IndexDataType, // IndexDataType
ComputeDataType, // ComputeDataType
ck::ReduceTensorOp::MAX,
true, // OutputIndex
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
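    // standard pooling output size (no dilation): out = (in + pad_left + pad_right - window) / stride + 1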
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
            // the host reference expects logical NCHW dimension order; the strides below keep the data in NHWC layout
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
};
// in
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
// out
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
// indices
Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
// dout
Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
// din
Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) *
indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
DeviceMem din_device_buf(sizeof(DInDataType) *
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());
auto pool_fwd = DevicePoolFwdInstance{};
auto pool_fwd_invoker_ptr = pool_fwd.MakeInvokerPointer();
auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        {N, C, Hi, Wi},              // input lengths (logical NCHW order)
        window_spatial_lengths,
        {N, C, Ho, Wo},              // output lengths
        {C * Hi * Wi, 1, Wi * C, C}, // input strides (NHWC memory layout)
        {C * Ho * Wo, 1, Wo * C, C}, // output strides (NHWC memory layout)
        {C * Ho * Wo, 1, Wo * C, C}, // index strides (NHWC memory layout)
        window_strides,
        input_left_pads,
        input_right_pads,
        {2, 3});                     // pool over the H and W dimensions
if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_fwd with the specified compilation parameters does "
"not support this problem");
}
float ave_time_fwd =
pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
auto pool_bwd = DeviceMaxPoolBwdInstance{};
auto pool_bwd_invoker_ptr = pool_bwd.MakeInvokerPointer();
auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
window_spatial_lengths,
window_strides);
if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
{
throw std::runtime_error("wrong! pool_bwd with the specified compilation parameters does "
"not support this problem");
}
size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(),
pool_bwd_workspace_device_buf.GetDeviceBuffer());
float ave_time_bwd =
pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
using ReferencePoolingFwdInstance =
ck::tensor_operation::host::ReferencePoolingFwd<4,
2,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ck::ReduceTensorOp::MAX,
PropagateNan,
true>;
auto ref_pooling_fwd = ReferencePoolingFwdInstance{};
auto ref_pooling_fwd_invoker = ref_pooling_fwd.MakeInvoker();
auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
input_left_pads,
input_right_pads);
ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
using ReferencePoolingBwdInstance =
ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
PassThrough>;
auto ref_pooling_bwd = ReferencePoolingBwdInstance{};
auto ref_pooling_bwd_invoker = ref_pooling_bwd.MakeInvoker();
auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
}
return (pass);
};
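For verification, the test above compares the device results against host references (ReferencePoolingFwd and ReferenceMaxPoolBwd). The backward step is essentially a scatter: each output gradient is added back to the input position recorded by the forward pass. A minimal host-side sketch of that idea, assuming flattened buffers and int32 indices as in the example (not the library's actual implementation):

// Sketch only: din[indices[i]] accumulates dout[i]. With overlapping pooling
// windows one input element can receive several contributions, which is why
// the device implementation switches to atomic adds in that case.
#include <algorithm>
#include <cstdint>
#include <vector>

inline void maxpool_bwd_host_sketch(const std::vector<float>& dout,
                                    const std::vector<int32_t>& indices,
                                    std::vector<float>& din)
{
    std::fill(din.begin(), din.end(), 0.0f);
    for(std::size_t i = 0; i < dout.size(); ++i)
        din[indices[i]] += dout[i];
}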
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = ck::half_t;
using DOutDataType = ck::half_t;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 1;
ck::index_t window_stride_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using InDataType = float;
using OutDataType = float;
using IndexDataType = int32_t;
using ComputeDataType = float;
using DInDataType = float;
using DOutDataType = float;
static constexpr bool PropagateNan = false;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// Pool shape
ck::index_t N = 1;
ck::index_t C = 1;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 32;
ck::index_t Wi = 32;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
bool pass = maxpool_bwd_test<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
DInDataType,
DOutDataType,
PropagateNan>(do_verification,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using XDataType = ck::half_t;
using YDataType = ck::half_t;
using IndexDataType = int32_t;
using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceInstance =
ck::tensor_operation::device::DevicePutElementImpl<XDataType, // XDataType
IndexDataType, // IndexDataType
YDataType, // YDataType
YElementwiseOp,
ck::InMemoryDataOperationEnum::Set,
1>;
int main()
{
bool do_verification = true;
bool time_kernel = false;
int N = 1024;
Tensor<XDataType> x(HostTensorDescriptor{N, 1});
Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
Tensor<YDataType> y(HostTensorDescriptor{N, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});
for(int i = 0; i < N; ++i)
indices(i) = i;
DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());
x_device_buf.ToDevice(x.mData.data());
indices_device_buf.ToDevice(indices.mData.data());
auto put_instance = DeviceInstance{};
auto put_invoker_ptr = put_instance.MakeInvokerPointer();
auto put_argument_ptr = put_instance.MakeArgumentPointer(
static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
N,
N,
YElementwiseOp{});
if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
{
throw std::runtime_error("argument is not supported!");
}
float ave_time =
put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::cout << "perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});
for(int i = 0; i < N; ++i)
{
IndexDataType idx = indices(i);
y_host(idx) = x(i);
}
y_device_buf.FromDevice(y.mData.data());
pass = ck::utils::check_err(y, y_host);
}
return (pass ? 0 : 1);
}
@@ -173,6 +173,10 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
+// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
+#define CK_WORKAROUND_SWDEV_3318619 0
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
...
@@ -8,7 +8,7 @@
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
-static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
+static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
 {
     constexpr int MAX_MASK_DWORDS = 64;
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// For pooling operators that produce indices, such as MaxPool, MinPool, etc.
template <typename DOutDataType, typename IndexDataType, typename DInDataType>
struct DeviceIndexPoolBwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
const void* p_indices,
void* p_din,
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/reduction_enums.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum Op>
struct DevicePutElement : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_input,
const void* p_indices,
void* p_output,
index_t input_length,
index_t output_length,
ElementwiseOperation elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
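The example earlier (put_element_fp16.cpp) exercises this interface with the Set memory operation. A simplified host-side picture of what an implementation computes, ignoring vectorization and the elementwise-operation template parameter (a hypothetical sketch, not the device code):

// Sketch of the Set variant: every input element is written to the output
// slot named by its index, i.e. output[indices[i]] = input[i].
template <typename In, typename Index, typename Out>
void put_element_host_sketch(const In* input, const Index* indices, Out* output, int input_length)
{
    for(int i = 0; i < input_length; ++i)
        output[indices[i]] = static_cast<Out>(input[i]);
}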
@@ -786,12 +786,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
             if(arg.d0s_nl_ns_lengths_strides_[i][1] == 1 &&
                arg.d0s_nl_ns_lengths_strides_[i][0] % D0sTransferSrcScalarPerVector != 0)
             {
-                std::cout << "first" << std::endl;
                 return false;
             }
             if(arg.d0s_nl_ns_lengths_strides_[i][1] != 1 && D0sTransferSrcScalarPerVector != 1)
             {
-                std::cout << "second" << std::endl;
                 return false;
             }
         }
...
@@ -76,7 +76,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     // TODO: should be exposed as Tparams.
     static constexpr index_t NumGemmKPrefetchStage = 1;
     static constexpr LoopScheduler LoopSched = make_default_loop_scheduler();
-    static constexpr PipelineVersion PipelineVer = PipelineVersion::v2;
+    static constexpr PipelineVersion PipelineVer = PipelineVersion::v1;
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
...
@@ -459,7 +459,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
               p_ds_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e)},
               num_group_{a_g_n_k_wos_lengths[0]},
-              num_gemm_{},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -508,9 +507,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             const auto YTilde = ConvStrideH / GcdStrideDilationH;
             const auto XTilde = ConvStrideW / GcdStrideDilationW;
-            // number of GEMM
-            num_gemm_ = YTilde * XTilde;
             for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
             {
                 for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
@@ -626,7 +622,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         void Print() const
         {
-            for(index_t i = 0; i < num_gemm_; i++)
+            for(std::size_t i = 0; i < a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 std::cout << "a_grid_desc_ak0_m_ak1_container_"
                           << a_grid_desc_ak0_m_ak1_container_[i] << std::endl;
@@ -654,7 +650,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         // tensor descriptor for problem definition
         index_t num_group_;
-        index_t num_gemm_;
         std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
         std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
         std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
@@ -708,7 +703,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             float ave_time = 0;
-            for(index_t i = 0; i < arg.num_gemm_; i++)
+            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                                 arg.b_grid_desc_n_k_container_[i],
@@ -807,7 +802,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             }
             // vector load for A matrix from global memory to LDS
-            if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
+            if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
+                         is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
             {
                 if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
                 {
@@ -862,7 +858,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             }
             // vector store for E
-            if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC>)
+            if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC> ||
+                         is_same_v<ELayout, tensor_layout::convolution::NHWGC>)
             {
                 // vector store C matrix into global memory
                 if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename DOutDataType,
typename IndexDataType,
typename DInDataType,
ck::index_t InOutVectorSize>
struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
{
using DInDataType_AutomicAddPreCast =
conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
DInDataType,
float>;
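    // Note: the overlapping-window path accumulates with atomic adds, which are applied
    // to DInDataType directly only for float/double; for other element types the
    // gradient is accumulated into a float workspace and cast back afterwards.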
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryConvert = ck::tensor_operation::element_wise::UnaryConvert;
static constexpr auto I0 = Number<0>{};
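    // Pad the 1-D length up to a multiple of loop_step (gridSize * blockSize *
    // InOutVectorSize) so the grid-stride loop can always issue full vector accesses;
    // the right-pad transform keeps the padded tail outside the tensor's valid
    // coordinate range.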
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step)
{
const auto m = desc_m.GetLength(I0);
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t loop_step)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, loop_step);
}
using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1));
using GridwisePutElementSet = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough,
InMemoryDataOperationEnum::Set,
InOutVectorSize>;
using GridwisePutElementAtomicAdd = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough,
InMemoryDataOperationEnum::AtomicAdd,
InOutVectorSize>;
using GridwiseCasting = GridwiseElementwise_1D<Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert,
InOutVectorSize,
Sequence<InOutVectorSize>,
Sequence<InOutVectorSize>>;
struct Argument : public BaseArgument
{
Argument(const DOutDataType* p_dout,
const IndexDataType* p_indices,
DInDataType* p_din,
index_t dout_length,
index_t din_length,
const std::vector<ck::index_t>& window_lengths,
const std::vector<ck::index_t>& window_strides)
: p_dout_{p_dout},
p_indices_{p_indices},
p_din_{p_din},
dout_length_raw_{dout_length},
din_length_raw_{din_length},
blockSize_{256},
windowOverlap_{false}
{
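            // Windows overlap when any window extent exceeds its stride; in that case
            // several pooled outputs can map to the same input element, so the backward
            // scatter must accumulate rather than overwrite.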
for(size_t i = 0; i < window_lengths.size(); ++i)
{
windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
}
}
const DOutDataType* p_dout_;
const IndexDataType* p_indices_;
DInDataType* p_din_;
index_t dout_length_raw_;
index_t din_length_raw_;
index_t blockSize_;
bool windowOverlap_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config);
index_t loop_step = gridSize * arg.blockSize_ * InOutVectorSize;
InOutGrid1dDesc din_grid_desc = MakeDescriptor_M(arg.din_length_raw_, loop_step);
InOutGrid1dDesc dout_grid_desc = MakeDescriptor_M(arg.dout_length_raw_, loop_step);
if constexpr(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>)
{
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
if(arg.windowOverlap_)
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
else
{
if(arg.windowOverlap_)
{
if(arg.p_workspace_ == nullptr)
throw std::runtime_error("wrong! WorkSpace pointer has not been set");
hip_check_error(
hipMemsetAsync(arg.p_workspace_,
0,
arg.din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast),
stream_config.stream_id_));
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough>;
const auto cast_kernel =
kernel_elementwise_1d<GridwiseCasting,
Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert>;
float elapsed_time = launch_and_time_kernel(
stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
PassThrough{});
elapsed_time += launch_and_time_kernel(
stream_config,
cast_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
ck::make_tuple(din_grid_desc),
ck::make_tuple(din_grid_desc),
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
arg.p_din_,
UnaryConvert{});
return elapsed_time;
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
{
const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
bool needCast = pArg_->windowOverlap_ &&
!(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>);
if(!needCast)
return 0;
else
return pArg_->din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast);
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->din_length_raw_ % InOutVectorSize != 0 ||
pArg->dout_length_raw_ % InOutVectorSize != 0)
{
return false;
}
return true;
}
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
const void* p_indices,
void* p_din,
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) override
{
        // Assume p_dout, p_indices and p_din point to packed memory; dout_length and
        // din_length are the physical sizes of the packed tensors.
return std::make_unique<Argument>(static_cast<const DOutDataType*>(p_dout),
static_cast<const IndexDataType*>(p_indices),
static_cast<DInDataType*>(p_din),
dout_length,
din_length,
window_lengths,
window_strides);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum MemOp,
ck::index_t InVectorSize>
struct DevicePutElementImpl
: public DevicePutElement<InDataType, IndexDataType, OutDataType, ElementwiseOperation, MemOp>
{
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
{
constexpr auto I0 = Number<0>{};
const auto m = desc_m.GetLength(I0);
const index_t loop_step = gridSize * blockSize * InVectorSize;
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t gridSize, index_t blockSize)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
}
using InGrid1dDesc = decltype(MakeDescriptor_M(1, 1, 1));
using GridwisePutElement = GridwisePutElement_1D<InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation,
MemOp,
InVectorSize>;
struct Argument : public BaseArgument
{
Argument(const InDataType* p_input,
const IndexDataType* p_indices,
OutDataType* p_output,
index_t input_length,
ElementwiseOperation elementwise_op)
: p_input_{p_input},
p_indices_{p_indices},
p_output_{p_output},
input_length_raw_{input_length},
elementwise_op_{elementwise_op},
blockSize_{256}
{
}
const InDataType* p_input_;
const IndexDataType* p_indices_;
OutDataType* p_output_;
index_t input_length_raw_;
ElementwiseOperation elementwise_op_;
index_t blockSize_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config);
InGrid1dDesc in_grid_desc =
MakeDescriptor_M(arg.input_length_raw_, gridSize, arg.blockSize_);
const auto kernel = kernel_put_element_1d<GridwisePutElement,
InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
in_grid_desc,
arg.p_input_,
arg.p_indices_,
arg.p_output_,
arg.elementwise_op_);
return elapsed_time;
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->input_length_raw_ % InVectorSize != 0)
{
return false;
}
return true;
}
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_input,
const void* p_indices,
void* p_output,
index_t input_length,
index_t,
ElementwiseOperation elementwise_op) override
{
return std::make_unique<Argument>(static_cast<const InDataType*>(p_input),
static_cast<const IndexDataType*>(p_indices),
static_cast<OutDataType*>(p_output),
input_length,
elementwise_op);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck