Commit 7a3b49e5 authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into contraction

parents e07b3d8e d3051d75
device_gemm_xdl_splitk_c_shuffle.hpp:
-#ifndef DEVICE_GEMM_XDL_SPLITK_C_SHUFFLE_HPP
-#define DEVICE_GEMM_XDL_SPLITK_C_SHUFFLE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_base.hpp"
-#include "device_gemm.hpp"
-#include "common_header.hpp"
-#include "tensor_layout.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_xdlops_v2r4r2.hpp"
-#include "gemm_specialization.hpp"
-#ifndef CK_RUN_KERNEL_AND_TIME
-#define CK_RUN_KERNEL_AND_TIME 1
-#endif
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -641,4 +640,3 @@ struct DeviceGemmXdlSplitKCShuffle
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
device_grouped_gemm_xdl.hpp:
-#ifndef DEVICE_GROUPED_GEMM_XDL_HPP
-#define DEVICE_GROUPED_GEMM_XDL_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_base.hpp"
-#include "device_gemm.hpp"
-#include "common_header.hpp"
-#include "tensor_layout.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_xdlops_v2r3.hpp"
-#include "gemm_specialization.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -362,7 +365,7 @@ struct DeviceGroupedGemmXdl
         {
             grid_size_ = 0;
-            gemm_descs_args_workspace_ = nullptr;
+            p_workspace_ = nullptr;
             group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());
@@ -437,8 +440,6 @@ struct DeviceGroupedGemmXdl
         std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;
-        void* gemm_descs_args_workspace_;
         index_t grid_size_;
     };
@@ -488,7 +489,7 @@ struct DeviceGroupedGemmXdl
            }
            hipGetErrorString(
-               hipMemcpy(arg.gemm_descs_args_workspace_,
+               hipMemcpy(arg.p_workspace_,
                          arg.gemm_desc_kernel_arg_.data(),
                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmDescKernelArg),
                          hipMemcpyHostToDevice));
@@ -507,17 +508,17 @@ struct DeviceGroupedGemmXdl
                   CElementwiseOperation,
                   true>;
               ave_time =
                   launch_and_time_kernel(stream_config,
                                          kernel,
                                          dim3(arg.grid_size_),
                                          dim3(BlockSize),
                                          0,
-                                         cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                                         cast_pointer_to_constant_address_space(arg.p_workspace_),
                                          arg.gemm_desc_kernel_arg_.size(),
                                          arg.a_element_op_,
                                          arg.b_element_op_,
                                          arg.c_element_op_);
           }
           else
           {
@@ -531,17 +532,17 @@ struct DeviceGroupedGemmXdl
                   CElementwiseOperation,
                   false>;
               ave_time =
                   launch_and_time_kernel(stream_config,
                                          kernel,
                                          dim3(arg.grid_size_),
                                          dim3(BlockSize),
                                          0,
-                                         cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                                         cast_pointer_to_constant_address_space(arg.p_workspace_),
                                          arg.gemm_desc_kernel_arg_.size(),
                                          arg.a_element_op_,
                                          arg.b_element_op_,
                                          arg.c_element_op_);
           }
           return ave_time;
@@ -635,14 +636,8 @@ struct DeviceGroupedGemmXdl
     {
         return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmDescKernelArg);
     }
-    void SetWorkSpacePointer(BaseArgument* p_arg, void* workspace_ptr) const override
-    {
-        dynamic_cast<Argument*>(p_arg)->gemm_descs_args_workspace_ = workspace_ptr;
-    }
 };
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
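For orientation, a minimal host-side sketch of how the grouped-GEMM descriptor workspace is wired up after this change. Only SetWorkSpacePointer(BaseArgument*, void*) and the group_count_ * sizeof(GemmDescKernelArg) sizing appear in the diff above; the GetWorkSpaceSize() name and the surrounding calls are assumptions for illustration only.

// Hypothetical usage sketch (not part of this commit).
#include <hip/hip_runtime.h>
#include <cstddef>

void setup_grouped_gemm_workspace(ck::tensor_operation::device::BaseOperator& gemm_instance,
                                  ck::tensor_operation::device::BaseArgument& arg)
{
    // assumed base-class query; the override above returns group_count_ * sizeof(GemmDescKernelArg)
    const std::size_t workspace_bytes = gemm_instance.GetWorkSpaceSize(&arg);

    void* p_workspace = nullptr;
    hipMalloc(&p_workspace, workspace_bytes);

    // after this commit the pointer is stored in the base-class member p_workspace_,
    // which the invoker's hipMemcpy of gemm_desc_kernel_arg_ targets
    gemm_instance.SetWorkSpacePointer(&arg, p_workspace);
}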
device_pool2d_fwd.hpp:
-#ifndef DEVICE_POOL2D_FWD_HPP
-#define DEVICE_POOL2D_FWD_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <array>
-#include "device_base.hpp"
-#include "reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/utility/reduction_enums.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -35,4 +38,3 @@ using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
device_pool2d_fwd_nhwc_nhwc.hpp:
-#ifndef DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
-#define DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_pool2d_fwd.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "reduction_operator_mapping.hpp"
-#include "gridwise_2d_reduction_threadwise.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -35,14 +40,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     using IndexDataType = int32_t;
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
     using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
     using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
     static constexpr index_t InSrcOutDstVectorDim =
         0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
@@ -178,13 +182,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         invariant_lowest_length_ = C;
         reduce_lowest_length_ = window_spatial_lengths[1];
-        // TODO: is this correct?
-        if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
-        {
-            ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
-            in_element_op_ = InElementwiseOperation{divider};
-            acc_element_op_ = AccElementwiseOperation{divider};
-        }
+        int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
+
+        std::tie(in_element_op_, acc_element_op_) =
+            reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
     }
     const InDataType* p_in_dev_;
@@ -319,9 +320,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         return str.str();
     }
-}; // namespace device
+};
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
device_reduce.hpp:
-#ifndef DEVICE_REDUCE_HPP
-#define DEVICE_REDUCE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <vector>
 #include <memory>
 #include <iostream>
-#include "common_header.hpp"
-#include "device_base.hpp"
-#include "reduction_enums.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -41,4 +43,3 @@ using DeviceReducePtr =
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
device_reduce_common.hpp:
-#ifndef DEVICE_REDUCE_COMMON_HPP
-#define DEVICE_REDUCE_COMMON_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <vector>
 #include <cassert>
-#include "common_header.hpp"
-#include "reduction_enums.hpp"
-#include "reduction_operator.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/utility/reduction_operator.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -85,6 +87,4 @@ std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origL
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
device_reduce_multiblock.hpp:
-#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP
-#define DEVICE_REDUCE_MULTIBLOCK_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_base.hpp"
-#include "device_reduce.hpp"
-#include "device_reduce_common.hpp"
-#include "gridwise_2d_reduction_multiblock.hpp"
-#include "gridwise_set_buffer_value.hpp"
-#include "reduction_operator.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -61,12 +67,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     static constexpr bool use_multiblock =
         (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
-    static constexpr bool out_type_compatible_with_atomic_op =
-        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
-    static_assert(
-        !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
-        "The OutDataType must support the atomic operation for using MultiBlock reduction");
+    static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
+                                                                      OutDataType>::value,
+                  "The OutDataType must support the specified OutMemoryDataOperation!");
     static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                   "MultiBlock reduction can only be used when outputing index is not required");
@@ -348,8 +351,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
             if constexpr(use_multiblock)
             {
-                const auto zeroVal =
-                    ck::reduce::GetReductionZeroValueForInMemoryDataOperation<OutDataType>(
+                const auto identityVal =
+                    ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
                         OutMemoryDataOperation);
                 const auto kernel_pre =
@@ -362,7 +365,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
                     0,
                     out_grid_desc_m_2,
                     arg.out_dev_,
-                    zeroVal);
+                    identityVal);
             };
             avg_time += launch_and_time_kernel(stream_config,
@@ -393,10 +396,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        };
    };
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    static bool IsSupportedArgument(const Argument* pArg)
     {
-        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
         if constexpr(use_multiblock)
         {
             if(static_cast<float>(pArg->beta_) != 0.0f)
@@ -445,11 +446,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         else
         {
             // cases with very small reduce_total_length should be handled by ThreadWise kernel
-            if(pArg->reduce_total_length / KThreadSliceSize < 2)
-                return (false);
+            // if(pArg->reduce_total_length / KThreadSliceSize < 2)
+            //     return (false);
         };
         return (true);
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
    };
    std::unique_ptr<BaseArgument>
@@ -492,7 +498,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        auto str = std::stringstream();
        // clang-format off
-       str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
+       str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
@@ -505,4 +511,3 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
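A note on the two guards above: with OutMemoryDataOperation == AtomicAdd every workgroup contributes only a partial result, so the output must first be set to the identity value (the kernel_pre launch) and a pre-existing output scaled by a non-zero beta would be folded in once per contributing block, which is why multiblock arguments with beta != 0 are rejected. A tiny host-side analogue of that flow, purely illustrative:

#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    const std::vector<float> in{1, 2, 3, 4, 5, 6, 7, 8};
    const int num_blocks = 2; // stand-in for blkGroupSize

    // step 1: "kernel_pre" - set the destination to the identity of the add operation
    float out = 0.0f;

    // step 2: each "block" reduces its slice and accumulates into the output;
    // plain += here plays the role of atomicAdd
    const std::size_t chunk = in.size() / num_blocks;
    for(int b = 0; b < num_blocks; ++b)
    {
        const float partial =
            std::accumulate(in.begin() + b * chunk, in.begin() + (b + 1) * chunk, 0.0f);
        out += partial;
    }

    std::cout << out << '\n'; // 36
}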
device_reduce_threadwise.hpp:
-#ifndef DEVICE_REDUCE_THREADWISE_HPP
-#define DEVICE_REDUCE_THREADWISE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_reduce.hpp"
-#include "device_reduce_common.hpp"
-#include "gridwise_2d_reduction_multiblock.hpp"
-#include "gridwise_2d_reduction_threadwise.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -370,4 +374,3 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceSoftmax : public BaseOperator
{
using PassThrough = tensor_operation::element_wise::PassThrough;
// Used for freeloading of some handy functions from DeviceReduceMultiBlock
using Reduction = DeviceReduceMultiBlock<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
reduce::Add,
PassThrough, // InElementwiseOperation
PassThrough, // AccElementwiseOperation
InMemoryDataOperationEnum::Set,
false, // PropagateNan
false, // OutputIndex
false, // HaveIndexInputIfOutputIndex
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
1>; // OutDstVectorSize
using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
OutDataType,
AccDataType,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
OutDstVectorSize>;
struct Argument : public Reduction::Argument
{
Argument(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<index_t> reduceDims,
AccDataType alpha,
AccDataType beta,
const InDataType* in_dev,
OutDataType* out_dev)
: Reduction::Argument(inLengths,
inStrides,
{},
{},
reduceDims,
0.0f, // alpha
0.0f, // beta
in_dev,
nullptr,
out_dev,
nullptr,
PassThrough{},
PassThrough{}),
// FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
// float32 precision. Make it support any data type so the fields can be removed.
alpha_(alpha),
beta_(beta)
{
// std::cout << "blkGroupSize= " << this->blkGroupSize
// << ", numBlockTileIteration= " << this->numBlockTileIteration
// << ", gridSize=" << this->gridSize
// << ", invariant_total_length=" << this->invariant_total_length <<
// std::endl;
}
AccDataType alpha_;
AccDataType beta_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
const auto kernel_main =
kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
float avg_time = 0;
avg_time += launch_and_time_kernel(stream_config,
kernel_main,
dim3(arg.gridSize),
dim3(BlockSize),
0,
in_grid_desc_m_k,
out_grid_desc_m_k,
arg.blkGroupSize,
arg.numBlockTileIteration,
arg.alpha_,
arg.in_dev_,
arg.beta_,
arg.out_dev_);
return (avg_time);
};
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
};
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
if(!Reduction::IsSupportedArgument(p_arg_))
{
return false;
}
if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
{
return false;
}
return true;
};
std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<int> reduceDims,
AccDataType alpha,
AccDataType beta,
const void* in_dev,
void* out_dev)
{
return std::make_unique<Argument>(inLengths,
inStrides,
reduceDims,
alpha,
beta,
static_cast<const InDataType*>(in_dev),
static_cast<OutDataType*>(out_dev));
};
std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceReduceSoftmax<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
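For reference, the quantity this device op is meant to produce, written out. The max-shifted form is the usual numerically stable softmax and is an assumption about the gridwise kernel; alpha and beta are the Argument fields above, and j runs over the reduce dimensions of each invariant slice:

y_i \leftarrow \alpha \,\frac{e^{\,x_i - \max_j x_j}}{\sum_j e^{\,x_j - \max_j x_j}} \;+\; \beta \, y_i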
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ADataType,
typename BDataType,
typename ElementwiseFunctor,
index_t Dim,
index_t ScalarPerVector>
struct DeviceUnaryElementwise : public BaseOperator
{
static constexpr auto I0 = Number<0>{};
template <typename Desc_M0>
static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
{
const auto m0 = desc_m0.GetLength(I0);
const index_t loop_step = gridSize * blockSize * ScalarPerVector;
const auto pad = math::integer_least_multiple(m0, loop_step) - m0;
const auto desc_m0_pad =
transform_tensor_descriptor(desc_m0,
make_tuple(make_right_pad_transform(m0, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m0_pad;
}
static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
const std::vector<index_t>& stride,
index_t gridSize,
index_t blockSize)
{
auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
// nd desc - [s0, s1, s2, ...]
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
// merge nd to 1d desc - [s0 * s1 * ...]
if constexpr(Dim > 1)
{
const auto desc_m0 = transform_tensor_descriptor(
desc,
make_tuple(make_merge_transform(tupleOfShape)),
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
make_tuple(Sequence<0>{}));
return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
}
else
return PadDescriptor_M0_1d(desc, gridSize, blockSize);
}
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
BDataType,
GridDesc_M0,
ElementwiseFunctor,
ScalarPerVector>;
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a,
BDataType* p_b,
const std::vector<index_t>& shape,
const std::vector<index_t>& stride_a,
const std::vector<index_t>& stride_b,
ElementwiseFunctor functor)
: p_a_(p_a),
p_b_(p_b),
shape_(shape),
functor_(functor),
blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
{
index_t tensor_size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
gridSize_ = GridwiseUEltwise::CalculateGridSize(tensor_size);
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
}
const ADataType* p_a_;
BDataType* p_b_;
std::vector<int> shape_;
GridDesc_M0 a_grid_desc_m0_;
GridDesc_M0 b_grid_desc_m0_;
ElementwiseFunctor functor_;
index_t blockSize_;
index_t gridSize_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
ADataType,
BDataType,
GridDesc_M0,
ElementwiseFunctor>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(arg.gridSize_),
dim3(arg.blockSize_),
0,
arg.p_a_,
arg.p_b_,
arg.a_grid_desc_m0_,
arg.b_grid_desc_m0_,
arg.functor_);
return elapsed_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg == nullptr)
return false;
if(pArg->shape_.back() % ScalarPerVector != 0)
return false;
return true;
};
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
void* p_b,
std::vector<index_t> shape,
std::vector<index_t> stride_a,
std::vector<index_t> stride_b,
ElementwiseFunctor functor)
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<BDataType*>(p_b),
shape,
stride_a,
stride_b,
functor);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBinaryElementwise"
<< "<"
<< "ScalarPerVector = " << ScalarPerVector
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
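PadDescriptor_M0_1d above right-pads the flattened length to a multiple of gridSize * blockSize * ScalarPerVector so the 1-D sweep always reads whole vectors; a small self-contained check of that arithmetic (all numbers are made up for illustration):

#include <iostream>

int main()
{
    // stand-ins for the values used by PadDescriptor_M0_1d
    const int m0              = 1000; // flattened tensor length
    const int gridSize        = 2;
    const int blockSize       = 256;
    const int scalarPerVector = 1;

    const int loop_step = gridSize * blockSize * scalarPerVector;         // 512
    const int padded    = ((m0 + loop_step - 1) / loop_step) * loop_step; // 1024, i.e. integer_least_multiple
    const int pad       = padded - m0;                                    // 24

    std::cout << "loop_step=" << loop_step << " pad=" << pad << '\n';
}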
gemm_specialization.hpp:
-#ifndef GEMM_SPECIALIZATION
-#define GEMM_SPECIALIZATION
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
 namespace ck {
 namespace tensor_operation {
@@ -20,4 +22,3 @@ enum struct GemmSpecialization
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
reduction_operator_mapping.hpp:
-/*******************************************************************************
- * (25-line MIT license header, Copyright (c) 2020 Advanced Micro Devices, Inc.)
- *******************************************************************************/
-#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
-#define CK_REDUCTION_OPERATOR_MAPPING_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "reduction_operator.hpp"
-#include "reduction_enums.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+// FIXME: can it be replaced with ck::Tuple?
+#include <tuple>
 namespace ck {
@@ -37,77 +16,69 @@ namespace ck {
 // The boolean member "indexable" are also provided in reduce_binary_operactor for
 // easier checking by the upper-layer codes in the kernels.
-template <typename T, ReduceTensorOp Op>
+template <ReduceTensorOp Op>
 struct reduce_binary_operator;
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::ADD>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::ADD>
 {
-    using opType   = reduce::Add<T>;
-    using dataType = T;
+    using opType = reduce::Add;
     static constexpr bool indexable = false;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::MUL>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MUL>
 {
-    using opType   = reduce::Mul<T>;
-    using dataType = T;
+    using opType = reduce::Mul;
     static constexpr bool indexable = false;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::MIN>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MIN>
 {
-    using opType   = reduce::Min<T>;
-    using dataType = T;
+    using opType = reduce::Min;
     static constexpr bool indexable = true;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::MAX>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MAX>
 {
-    using opType   = reduce::Max<T>;
-    using dataType = T;
+    using opType = reduce::Max;
    static constexpr bool indexable = true;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::AMAX>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::AMAX>
 {
-    using opType   = reduce::AMax<T>;
-    using dataType = T;
+    using opType = reduce::AMax;
     static constexpr bool indexable = true;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::AVG>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::AVG>
 {
-    using opType   = reduce::Add<T>;
-    using dataType = T;
+    using opType = reduce::Add;
     static constexpr bool indexable = false;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::NORM1>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::NORM1>
 {
-    using opType   = reduce::Add<T>;
-    using dataType = T;
+    using opType = reduce::Add;
     static constexpr bool indexable = false;
 };
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::NORM2>
 {
-    using opType   = reduce::Add<T>;
-    using dataType = T;
+    using opType = reduce::Add;
     static constexpr bool indexable = false;
 };
@@ -115,55 +86,101 @@ struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
 // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
 // functor classes.
 // The two unary functors are called before and afer the Reduction is executed respectively
-template <typename T, ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
+template <ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
 struct reduce_unary_operator
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T, bool IsFirstReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::AVG, IsFirstReduce, true>
+template <bool IsFirstReduce>
+struct reduce_unary_operator<ReduceTensorOp::AVG, IsFirstReduce, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength});
+    };
 };
-template <typename T, bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM1, true, IsLastReduce>
+template <bool IsLastReduce>
+struct reduce_unary_operator<ReduceTensorOp::NORM1, true, IsLastReduce>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T, bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::AMAX, true, IsLastReduce>
+template <bool IsLastReduce>
+struct reduce_unary_operator<ReduceTensorOp::AMAX, true, IsLastReduce>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, false>
+template <>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, true, false>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, true>
+template <>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, true, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, false, true>
+template <>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, false, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-} // end of namespace ck
+} // namespace ck
-#endif
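A compact sketch of how these traits are consumed, mirroring the DevicePool2dFwd change earlier in this commit; the function and the reduceLength value are illustrative only:

#include <cstdint>
#include <tuple>

// Illustrative only; assumes the ck headers above are available.
void configure_avg_reduction()
{
    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

    // reduce::Add performs the accumulation; AVG is not an indexable reduction.
    using ReduceOperation = ck::reduce_binary_operator<ReduceOpId>::opType;
    static_assert(!ck::reduce_binary_operator<ReduceOpId>::indexable, "AVG has no index output");

    // PassThrough on the way in, UnaryDivide(reduceLength) on the accumulated sum.
    const std::int32_t reduceLength = 9; // e.g. a 3x3 pooling window
    const auto elementwise_ops =
        ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);

    const auto in_op  = std::get<0>(elementwise_ops); // InElementwiseOperation
    const auto acc_op = std::get<1>(elementwise_ops); // AccElementwiseOperation
    (void)in_op;
    (void)acc_op;
    (void)sizeof(ReduceOperation);
}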
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
 namespace ck {
...
binary_element_wise_operation.hpp:
-/*******************************************************************************
- * (25-line MIT license header, Copyright (c) 2022 Advanced Micro Devices, Inc.)
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace tensor_operation {
-namespace binary_element_wise {
-template <typename Y, typename X1, typename X2>
-struct Add;
-
-template <>
-struct Add<double, double, double>
-{
-    __host__ __device__ constexpr void
-    operator()(double& dst, const double& src1, const double& src2) const
-    {
-        dst = src1 + src2;
-    }
-};
-
-template <>
-struct Add<float, float, float>
-{
-    __host__ __device__ constexpr void
-    operator()(float& dst, const float& src1, const float& src2) const
-    {
-        dst = src1 + src2;
-    }
-};
-
-template <>
-struct Add<half_t, half_t, half_t>
-{
-    __host__ __device__ constexpr void
-    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
-    {
-        dst = src1 + src2;
-    }
-};
-
-template <>
-struct Add<bhalf_t, bhalf_t, bhalf_t>
-{
-    __host__ __device__ constexpr void
-    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
-    {
-        const float x1 = ck::type_convert<float>(src1);
-        const float x2 = ck::type_convert<float>(src2);
-        const float y = x1 + x2;
-        dst = ck::type_convert<bhalf_t>(y);
-    }
-};
-
-template <typename Y, typename X1, typename X2>
-struct Substract;
-
-template <>
-struct Substract<double, double, double>
-{
-    __host__ __device__ constexpr void
-    operator()(double& dst, const double& src1, const double& src2) const
-    {
-        dst = src1 - src2;
-    }
-};
-
-template <>
-struct Substract<float, float, float>
-{
-    __host__ __device__ constexpr void
-    operator()(float& dst, const float& src1, const float& src2) const
-    {
-        dst = src1 - src2;
-    }
-};
-
-template <>
-struct Substract<half_t, half_t, half_t>
-{
-    __host__ __device__ constexpr void
-    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
-    {
-        dst = src1 - src2;
-    }
-};
-
-template <>
-struct Substract<bhalf_t, bhalf_t, bhalf_t>
-{
-    __host__ __device__ constexpr void
-    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
-    {
-        const float x1 = ck::type_convert<float>(src1);
-        const float x2 = ck::type_convert<float>(src2);
-        const float y = x1 - x2;
-        dst = ck::type_convert<bhalf_t>(y);
-    }
-};
+namespace element_wise {
+struct Add
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        y = x0 + x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = x0 + x1;
+    };
+
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = x0 + x1;
+    };
+
+    // Question: should bhalf_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp + x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
+    }
+};
+
+struct Subtract
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        y = x0 - x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = x0 - x1;
+    };
+
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = x0 - x1;
+    };
+
+    // Question: should bhalf_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp - x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
+    }
+};
+
+struct AlphaBetaAdd
+{
+    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        y = alpha_ * x0 + beta_ * x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = static_cast<double>(alpha_) * x0 + static_cast<double>(beta_) * x1;
+    };
+
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = static_cast<half_t>(alpha_ * static_cast<float>(x0) + beta_ * static_cast<float>(x1));
+    };
+
+    float alpha_;
+    float beta_;
+};
+
+struct AddRelu
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        const float a = x0 + x1;
+        y = a > 0.0f ? a : 0.0f;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        const double a = x0 + x1;
+        y = a > 0.0 ? a : 0.0;
+    };
+
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        const half_t a = x0 + x1;
+        y = a > static_cast<half_t>(0.0f) ? a : static_cast<half_t>(0.0f);
+    };
+};
+
+struct AddHardswish
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        float a = x0 + x1;
+        float b = a + float{3};
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
+        y = c;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        double a = x0 + x1;
+        double b = a + 3.0;
+        double c = (b > 0) * (b > 6.0 ? 6.0 : b) * a * 0.166667;
+        y = c;
+    };
+
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        float a = x0 + x1;
+        float b = a + 3.0f;
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
+        y = c;
+    };
+};
-} // namespace binary_element_wise
+} // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
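The functors above declare a primary operator() template and define only explicit specializations, so an unsupported type combination cannot be reached through an implicit conversion. A standalone, host-only sketch of the same idea with hypothetical names (the specialization is placed at namespace scope to stay strictly standard C++):

#include <iostream>

namespace demo {

struct Add
{
    // Primary template is declared but never defined: calling it with a type
    // combination that has no explicit specialization fails instead of being
    // silently routed through an implicit conversion (e.g. half -> float).
    template <typename Y, typename X0, typename X1>
    void operator()(Y& y, const X0& x0, const X1& x1) const;
};

// Only the combinations we explicitly bless get a definition.
template <>
void Add::operator()(float& y, const float& x0, const float& x1) const
{
    y = x0 + x1;
}

} // namespace demo

int main()
{
    demo::Add add{};
    float y = 0.0f;
    add(y, 1.5f, 2.5f);     // resolves to the float specialization
    std::cout << y << '\n'; // 4
    // double d = 0; add(d, 1.0, 2.0); // would fail to link: no <double> specialization
}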
element_wise_operation.hpp:
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/math_v2.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace element_wise {
+// Need to ensure compiler will fail if there is no matching candidate, instead of compiler
+// siliently do implicit type conversion
+//
+// Method 1:
+//
+// struct ExampleElementwiseOp
+// {
+//     template<typename Y, typename X>
+//     __host__ __device__ constexpr void
+//     operator()(Y&, const X) const;
+//
+//     template<>
+//     __host__ __device__ constexpr void
+//     operator()<half_t, half_t>(half_t& y, const half_t& x) const
+//     {
+//     }
+// };
+//
+// Method 2:
+//
+// template <typename Y, typename X>
+// struct ExampleElementwiseOp;
+//
+// template <>
+// struct ExampleElementwiseOp<float, ck::bhalf_t>
+// {
+//     __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
+//     {
+//     }
+// };
-struct PassThrough
-{
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x; }
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { y = x; }
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
-};
-
-struct Add
-{
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        y = x0 + x1;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        // FIXME - Use float (acc type) bias in the future.
-        y = x0 + x1;
-    }
-};
-
-struct AlphaBetaAdd
-{
-    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
-
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        y = alpha_ * x0 + beta_ * x1;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        // FIXME - Let x0 be acc type
-        y = static_cast<half_t>(alpha_ * static_cast<float>(x0) + beta_ * static_cast<float>(x1));
-    }
-
-    float alpha_;
-    float beta_;
-};
-
-struct AddRelu
-{
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        const float a = x0 + x1;
-        y = a > 0 ? a : 0;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        const half_t a = x0 + x1;
-        y = a > 0 ? a : 0;
-    }
-};
-
-struct AddHardswish
-{
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        float a = x0 + x1;
-        float b = a + float{3};
-        float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
-        y = c;
-    }
-
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        float a = x0 + x1;
-        float b = a + float{3};
-        float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
-        y = c;
-    }
-};
 struct AddReluAdd
 {
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
+    template <typename Y, typename X0, typename X1, typename X2>
+    __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
     {
         half_t a = x0 + x1;
         half_t b = a > 0 ? a : 0;
         y = b + x2;
     }
-    __host__ __device__ constexpr void
-    operator()(float& y, const float& x0, const float& x1, const float& x2) const
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
+                                                                              const float& x0,
+                                                                              const float& x1,
+                                                                              const float& x2) const
     {
         float a = x0 + x1;
         float b = a > 0 ? a : 0;
@@ -110,8 +69,9 @@ struct AddReluAdd
         y = c;
     }
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
+        half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
     {
         float a = x0 + x1;
         float b = a > 0 ? a : 0;
@@ -122,8 +82,14 @@ struct AddReluAdd
 struct AddHardswishAdd
 {
-    __host__ __device__ constexpr void
-    operator()(float& y, const float& x0, const float& x1, const float& x2) const
+    template <typename Y, typename X0, typename X1, typename X2>
+    __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
+                                                                              const float& x0,
+                                                                              const float& x1,
+                                                                              const float& x2) const
     {
         float a = x0 + x1;
         float b = a + float{3};
@@ -132,8 +98,9 @@ struct AddHardswishAdd
         y = d;
     }
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
     {
         float a = x0 + x1;
         float b = a + float{3};
@@ -143,208 +110,95 @@ struct AddHardswishAdd
     }
 };
+// C = A * B
+// E = FastGelu(C + D0 + D1)
+struct AddAddFastGelu
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ void operator()(E&, const C&, const D0&, const D1&) const;
+
+    template <>
+    __host__ __device__ void operator()<half_t, float, half_t, half_t>(half_t& e,
+                                                                       const float& c,
+                                                                       const half_t& d0,
+                                                                       const half_t& d1) const
+    {
+        // Fast GeLU
+        // https://paperswithcode.com/method/gelu
+        // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+        const auto fast_gelu = [&](float x) {
+            const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
+            const float emu = exp(-u);
+            const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
+            return x * cdf;
+        };
+
+        const float y = fast_gelu(c + float(d0) + float(d1));
+
+        e = type_convert<half_t>(y);
+    }
+};
-
-struct Normalize
-{
-    Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {}
-
-    __host__ __device__ constexpr void operator()(float& y,
-                                                  const float& x,
-                                                  const float& mean,
-                                                  const float& mean_square,
-                                                  const float& gamma,
-                                                  const float& beta) const
-    {
-        float variance = mean_square - (mean * mean);
-        y = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta;
-    }
-
-    float epsilon_;
-};
+
+struct Normalize
+{
+    // FIXME: is double absolutely necessary?
+    Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {}
+
+    template <typename T>
+    __host__ __device__ constexpr void operator()(
+        T& y, const T& x, const T& mean, const T& mean_square, const T& gamma, const T& beta) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float>(float& y,
+                                                         const float& x,
+                                                         const float& mean,
+                                                         const float& mean_square,
+                                                         const float& gamma,
+                                                         const float& beta) const
+    {
+        using ck::math::sqrt;
+
+        float variance = mean_square - (mean * mean);
+        y = ((x - mean) / sqrt(variance + static_cast<float>(epsilon_))) * gamma + beta;
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<double>(double& y,
+                                                          const double& x,
+                                                          const double& mean,
+                                                          const double& mean_square,
+                                                          const double& gamma,
+                                                          const double& beta) const
+    {
+        using ck::math::sqrt;
+
+        double variance = mean_square - (mean * mean);
+        y = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta;
+    };
+
+    // FIXME: is double absolutely necessary?
+    double epsilon_;
+};
-
-// Unary operators are usually called element-wisely before/after the reduction is executed on the
-// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
-
-template <typename Y, typename X, bool HasDividing = false>
-struct UnaryIdentic;
-
-template <>
-struct UnaryIdentic<float, float, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<float, float, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x / type_convert<float>(divider_);
-    };
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnaryIdentic<half_t, half_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<double, double, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<double, double, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const
-    {
-        y = x / type_convert<double>(divider_);
-    };
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnaryIdentic<int32_t, int32_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<int32_t, int32_t, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnaryIdentic<int8_t, int8_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
-};
-
-template <typename Y, typename X, bool HasDividing = false>
-struct UnarySquare;
-
-template <>
-struct UnarySquare<float, float, false>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
-};
-
-template <>
-struct UnarySquare<float, float, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x * x / type_convert<float>(divider_);
-    };
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnarySquare<double, double, false>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
-};
-
-template <>
-struct UnarySquare<double, double, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const
-    {
-        y = x * x / type_convert<double>(divider_);
-    };
-    int32_t divider_ = 1;
-};
template <typename Y, typename X> template <typename Y, typename X>
struct UnaryAbs; struct UnaryTypeConvert;
template <>
struct UnaryAbs<float, float>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<half_t, half_t>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
};
template <> template <>
struct UnaryAbs<double, double> struct UnaryTypeConvert<float, ck::bhalf_t>
{ {
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
__host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<int8_t, int8_t>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(int8_t& y, const int8_t& x) const
{ {
int8_t sgn = x >> (8 - 1); y = ck::type_convert<float, ck::bhalf_t>(x);
}
y = (x ^ sgn) - sgn;
};
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
}; };
template <> template <>
struct UnarySqrt<double, double> struct UnaryTypeConvert<ck::bhalf_t, float>
{ {
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; }; __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
{
__host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); }; y = ck::type_convert<ck::bhalf_t, float>(x);
}
}; };
} // namespace element_wise } // namespace element_wise
......
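A note on the new Normalize functor above: it never sees the raw samples, only the running first and second moments (mean and mean_square), so the variance is recovered from the usual moment identity before the affine transform is applied:

    \operatorname{Var}[x] \;=\; \mathbb{E}[x^{2}] - \big(\mathbb{E}[x]\big)^{2},
    \qquad
    y \;=\; \frac{x - \mathbb{E}[x]}{\sqrt{\operatorname{Var}[x] + \epsilon}}\;\gamma \;+\; \beta,

with mean corresponding to \mathbb{E}[x] and mean_square to \mathbb{E}[x^{2}] in the code.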
#pragma once
#include "data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace element_wise {
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
namespace ck {
namespace tensor_operation {
namespace element_wise {
struct PassThrough
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
is_same<T, int32_t>::value || is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = x;
};
};
struct UnaryDivide
{
__host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider){};
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, int32_t>::value,
"Data type is not supported by this operation!");
y = x / type_convert<T>(divider_);
};
int32_t divider_ = 1;
};
struct UnarySquare
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value,
"Data type is not supported by this operation!");
y = x * x;
};
};
struct UnaryAbs
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, int32_t>::value ||
is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = ck::math::abs(x);
};
};
struct UnarySqrt
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value,
"Data type is not supported by this operation!");
y = ck::math::sqrt(x);
};
};
struct Relu
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, int32_t>::value ||
is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = x > 0 ? x : 0;
}
template <>
__host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
{
float x_f32 = ck::type_convert<float>(x);
float y_f32 = x_f32 > 0 ? x_f32 : 0;
y = ck::type_convert<bhalf_t>(y_f32);
}
};
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
struct FastGelu
{
template <typename Y, typename X>
__host__ __device__ void operator()(Y& y, const X& x) const;
template <>
__host__ __device__ void operator()<float, float>(float& y, const float& x) const
{
const float u = float(2) * x * (float(0.035677) * x * x + float(0.797885));
const float emu = exp(-u);
const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
y = x * cdf;
}
};
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
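The FastGelu body above is the same tanh approximation quoted in its comment, rewritten so that only one exponential is evaluated: since 0.797885 ≈ sqrt(2/π) and 0.035677 ≈ 0.044715 · sqrt(2/π), the intermediate u equals 2·sqrt(2/π)·(x + 0.044715·x³), and the code's 2/(1 + e^{-u}) − 1 is exactly tanh(u/2):

    y \;=\; \tfrac{1}{2}\,x\left(1 + \tanh\!\Big(\sqrt{\tfrac{2}{\pi}}\,\big(x + 0.044715\,x^{3}\big)\Big)\right),
    \qquad
    \tanh\!\Big(\tfrac{u}{2}\Big) \;=\; \frac{2}{1 + e^{-u}} - 1,
    \quad
    u \;=\; 2\sqrt{\tfrac{2}{\pi}}\,\big(x + 0.044715\,x^{3}\big).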
-#ifndef UTILITY_BLOCK_TO_CTILE_MAP
-#define UTILITY_BLOCK_TO_CTILE_MAP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

-#include "utility/math.hpp"
-#include "utility/number.hpp"
-#include "tensor_description/tensor_adaptor.hpp"
-#include "tensor_description/multi_index_transform_helper.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"

namespace ck {

@@ -485,5 +487,3 @@ __host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx,
}

} // namespace ck
-
-#endif // UTILITY_BLOCK_TO_CTILE_MAP
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_blockwise.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

@@ -171,15 +147,15 @@ struct GridwiseReduction_mk_to_m_multiblock
                               AccDataType beta,
                               OutDataType* const __restrict__ p_out_value_global)
    {
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

-        const auto in_global_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_value_global,
+            in_grid_desc_m_k.GetElementSpaceSize(),
+            ReduceOperation::template GetIdentityValue<InDataType>());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -191,7 +167,7 @@ struct GridwiseReduction_mk_to_m_multiblock
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();

@@ -358,12 +334,12 @@ struct GridwiseReduction_mk_to_m_multiblock
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

-        const auto in_global_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_value_global,
+            in_grid_desc_m_k.GetElementSpaceSize(),
+            ReduceOperation::template GetIdentityValue<InDataType>());
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(

@@ -418,7 +394,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                         thread_k_cluster_id * KThreadSliceSize));

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
            accu_index_buf(I) = 0;
        });

@@ -459,7 +435,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                                             in_thread_idx_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType tmpValue   = zeroVal;
+                AccDataType tmpValue   = identityVal;
                IndexDataType tmpIndex = 0;

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {

@@ -512,7 +488,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                                   in_thread_val_buf(Number<offset>{}));
            });

-            AccDataType tmpValue   = zeroVal;
+            AccDataType tmpValue   = identityVal;
            IndexDataType tmpIndex = 0;

            static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {

@@ -635,4 +611,3 @@ struct GridwiseReduction_mk_to_m_multiblock
};

} // namespace ck
-
-#endif
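The rename from GetReductionZeroVal to GetIdentityValue in the hunks above reflects what the value actually is: the identity element of the reduction, used both to seed per-thread accumulators and to pad out-of-bounds loads so they cannot change the result. As a hedged illustration of the idea (plain C++, not CK's reduction_operator.hpp, whose contents are not shown in this diff), the identity differs per operation:

    #include <limits>

    // Illustrative only: the per-operation identity a reduce op could expose through a
    // GetIdentityValue<T>()-style hook, as queried by the kernels above.
    struct IllustrativeAdd
    {
        template <typename T>
        static constexpr T GetIdentityValue()
        {
            return T{0}; // x + 0 == x for any x
        }
    };

    struct IllustrativeMax
    {
        template <typename T>
        static constexpr T GetIdentityValue()
        {
            return std::numeric_limits<T>::lowest(); // max(x, lowest) == x for any x
        }
    };

    static_assert(IllustrativeAdd::GetIdentityValue<int>() == 0, "additive identity");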
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-
-#include "data_type.hpp"
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

@@ -135,12 +112,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                      ReduceOperation,
                                                      PropagateNan>;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

-        const auto in_global_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_value_global,
+            in_grid_desc_m_k.GetElementSpaceSize(),
+            ReduceOperation::template GetIdentityValue<InDataType>());
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -149,7 +126,7 @@ struct GridwiseReduction_mk_to_m_threadwise
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

@@ -276,12 +253,12 @@ struct GridwiseReduction_mk_to_m_threadwise
        (void)acc_elementwise_op;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

-        const auto in_global_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_value_global,
+            in_grid_desc_m_k.GetElementSpaceSize(),
+            ReduceOperation::template GetIdentityValue<InDataType>());
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

@@ -303,7 +280,7 @@ struct GridwiseReduction_mk_to_m_threadwise
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
            accu_index_buf(I) = 0;
        });

@@ -495,4 +472,3 @@ struct GridwiseReduction_mk_to_m_threadwise
};

} // namespace ck
-
-#endif
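The threadwise path follows the same pattern as the multiblock kernel: each accumulator slot starts at the identity value and then folds its K-slice element by element. A stripped-down, hedged sketch of that loop (the function name and the in-place accumulate signature are illustrative, not copied from CK):

    // Simplified per-thread reduction over a slice of KThreadSliceSize elements.
    template <typename AccDataType, typename ReduceOperation, int KThreadSliceSize>
    AccDataType reduce_thread_slice(const AccDataType (&slice)[KThreadSliceSize])
    {
        // Start from the reduction's identity so a padded/empty slice leaves the result unchanged.
        AccDataType acc = ReduceOperation::template GetIdentityValue<AccDataType>();

        for(int k = 0; k < KThreadSliceSize; ++k)
        {
            // In-place accumulate, e.g. acc = acc + x or acc = max(acc, x).
            ReduceOperation{}(acc, slice[k]);
        }

        return acc;
    }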