Commit c0f698d5 authored by carlushuang

Add test for threadwise transfer. Currently static_ford in threadwise transfer cannot support large MC*KC tile sizes.
parent e6ee6594
#ifndef CK_BLOCKWISE_GEMM_AVX2_HPP
#define CK_BLOCKWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "threadwise_gemm_avx2.hpp"
namespace ck {
namespace cpu {
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename ABlockDesc,
typename BBlockDesc,
typename CBlockDesc,
typename ABlockSliceLengths,
typename BBlockSliceLengths,
typename CBlockSliceLengths,
typename AThreadSliceLength,
typename BThreadSliceLength,
ck::index_t AThreadLoopOverDim, // dim of the thread slice to loop over within the block
// slice; 1D is enough for now
ck::index_t BThreadLoopOverDim,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename ThreadMNAccessOrder // how we access GEMM M/N to utilize the micro kernel
>
struct BlockwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
static constexpr index_t nDimA = ABlockDesc::GetNumOfDimension();
static constexpr index_t nDimB = BBlockDesc::GetNumOfDimension();
static constexpr index_t nDimC = CBlockDesc::GetNumOfDimension();
using IndexA = MultiIndex<nDimA>;
using IndexB = MultiIndex<nDimB>;
using IndexC = MultiIndex<nDimC>;
using ACoord = decltype(make_tensor_coordinate(ABlockDesc{}, IndexA{}));
using BCoord = decltype(make_tensor_coordinate(BBlockDesc{}, IndexB{}));
using CCoord = decltype(make_tensor_coordinate(CBlockDesc{}, IndexC{}));
#if 0
constexpr BlockwiseGemmAvx2_MxN(const ABlockDesc & a_block_desc, const IndexA& a_thread_origin,
const BBlockDesc & b_block_desc, const IndexB& b_thread_origin)
: a_thread_coord_(make_tensor_coordinate(a_block_desc, a_thread_origin)),
b_thread_coord_(make_tensor_coordinate(b_block_desc, b_thread_origin)),
{
}
#endif
template <typename TensorDesc>
static constexpr auto GetLeadingElement(const TensorDesc& desc)
{
// if using this function, make sure desc is known at compile time;
// otherwise it is not efficient to calculate the leading dimension here
if constexpr(TensorDesc::GetNumOfDimension() == 1)
{
return 1;
}
else
{
constexpr auto last_dims =
typename uniform_sequence_gen<TensorDesc::GetNumOfDimension() - 1, 0>::type{};
constexpr auto lead_dims = decltype(last_dims)::PushFront(Number<1>{});
return desc.CalculateOffset(lead_dims);
}
}
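// Worked example (a sketch, assuming a packed row-major 2D descriptor of shape [M, K]):
// lead_dims becomes Sequence<1, 0>, so CalculateOffset returns the offset of element
// (1, 0), i.e. K elements. Run() below multiplies this by sizeof(FloatA/B/C) to obtain
// the byte strides (lda/ldb/ldc) handed to the micro kernel.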
template <typename ABlockBuffer, typename BBlockBuffer, typename CBlockBuffer>
void Run(const ABlockDesc& a_block_desc,
const ABlockBuffer& a_block_buf,
const IndexA& a_origin,
const BBlockDesc& b_block_desc,
const BBlockBuffer& b_block_buf,
const IndexB& b_origin,
const CBlockDesc& c_block_desc,
CBlockBuffer& c_block_buf,
const IndexC& c_origin) const
{
constexpr auto m_n_block_length =
ck::Sequence<ABlockSliceLengths::At(AThreadLoopOverDim),
BBlockSliceLengths::At(BThreadLoopOverDim)>{};
constexpr auto m_n_thread_length =
ck::Sequence<AThreadSliceLength::At(AThreadLoopOverDim),
BThreadSliceLength::At(BThreadLoopOverDim)>{};
constexpr auto m_n_access_length = m_n_block_length / m_n_thread_length;
constexpr auto ordered_m_n_access_length =
container_reorder_given_new2old(m_n_access_length, ThreadMNAccessOrder{});
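// Example (a sketch, assuming the block tile divides the micro-kernel tile evenly):
// a 96 x 128 (M x N) block tile with a 6 x 16 micro kernel gives
// m_n_access_length = Sequence<96 / 6, 128 / 16> = Sequence<16, 8>, and the static_ford
// below visits those 16 * 8 micro-tiles in ThreadMNAccessOrder, invoking
// ThreadwiseGemm_Dispatch::Run once per tile with edge sizes clipped by
// current_mr / current_nr.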
constexpr auto a_block_idx_zeros =
typename uniform_sequence_gen<nDimA, 0>::type{}; // starting point of the block
constexpr auto b_block_idx_zeros = typename uniform_sequence_gen<nDimB, 0>::type{};
constexpr auto lda = GetLeadingElement(a_block_desc) * sizeof(FloatA);
constexpr auto ldb = GetLeadingElement(b_block_desc) * sizeof(FloatB);
constexpr auto ldc = GetLeadingElement(c_block_desc) * sizeof(FloatC);
ck::cpu::ThreadwiseGemmParam param;
param.Kr = KPerBlock;
param.lda = lda;
param.ldb = ldb;
param.ldc = ldc;
param.alpha = 1.0f; // TODO
static_ford<decltype(ordered_m_n_access_length)>{}([&](auto ordered_idx) {
constexpr auto origin_m_n_idx = ordered_idx.ReorderGivenOld2New(ThreadMNAccessOrder{});
constexpr auto current_m_idx =
origin_m_n_idx.At(0) * AThreadSliceLength::At(AThreadLoopOverDim);
constexpr auto current_n_idx =
origin_m_n_idx.At(1) * BThreadSliceLength::At(BThreadLoopOverDim);
constexpr auto current_mr =
ck::math::min(m_n_block_length.At(0) - current_m_idx, m_n_thread_length.At(0));
constexpr auto current_nr =
ck::math::min(m_n_block_length.At(1) - current_n_idx, m_n_thread_length.At(1));
constexpr auto a_block_idx =
a_block_idx_zeros.Modify(AThreadLoopOverDim, current_m_idx);
constexpr auto a_block_coord =
make_tensor_coordinate(a_block_desc, to_multi_index(a_origin + a_block_idx));
constexpr auto b_block_idx =
b_block_idx_zeros.Modify(BThreadLoopOverDim, current_n_idx);
constexpr auto b_block_coord =
make_tensor_coordinate(b_block_desc, to_multi_index(b_origin + b_block_idx));
constexpr auto c_block_coord =
make_tensor_coordinate(c_block_desc, to_multi_index(c_origin + origin_m_n_idx));
param.p_a = &a_block_buf.p_data_[a_block_coord.GetOffset()];
param.p_b = &b_block_buf.p_data_[b_block_coord.GetOffset()];
param.p_c = &c_block_buf.p_data_[c_block_coord.GetOffset()];
ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
});
}
};
} // namespace cpu
} // namespace ck
#endif
#ifndef CONVOLUTION_FORWARD_SPECIALIZATION_CPU
#define CONVOLUTION_FORWARD_SPECIALIZATION_CPU
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
enum ConvolutionForwardSpecialization_t
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
};
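// For example, Filter1x1Stride1Pad0 asserts Y = X = 1, stride 1 and zero padding, in which
// case the NHWC input can be read directly as the GEMM A matrix of shape [N*Ho*Wo, C]
// (see the corresponding branch of GetInputTensorDescriptor in the AVX2 device op below).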
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_BASE_CPU_HPP
#define DEVICE_BASE_CPU_HPP
#include <string>
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
struct BaseArgument
{
BaseArgument() = default;
BaseArgument(const BaseArgument&) = default;
BaseArgument& operator=(const BaseArgument&) = default;
virtual ~BaseArgument() {}
};
struct BaseInvoker
{
BaseInvoker() = default;
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, int = 1) = 0;
virtual ~BaseInvoker() {}
};
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
virtual bool IsSupportedArgument(const BaseArgument*) = 0;
virtual std::string GetTypeString() const = 0;
virtual ~BaseOperator() {}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_CONV_FWD_CPU_HPP
#define DEVICE_CONV_FWD_CPU_HPP
#include <iostream>
#include "device_base_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
struct DeviceConvFwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in,
const void* p_wei,
void* p_out,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
using DeviceConvFwdPtr = std::unique_ptr<
DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_HPP
#define DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_HPP
#include <iostream>
#include <sstream>
#include <numeric>
#include "device.hpp"
#include "device_base_cpu.hpp"
#include "device_conv_fwd_cpu.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "common_header.hpp"
#include "../../gpu/device/tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_avx2.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
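// Implicit-GEMM view used throughout this file (a sketch; see GetGemmM / GetGemmK below):
//   GemmM = N * Ho * Wo   -> rows of A and C (one row per output pixel)
//   GemmN = K             -> columns of B and C (one column per output channel)
//   GemmK = C * Y * X     -> reduction dimension (input channels times filter window)
// e.g. N = 1, Ho = Wo = 28, K = 64, C = 32, Y = X = 3 gives a 784 x 64 GEMM with GemmK = 288.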
template <typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ConvolutionForwardSpecialization_t ConvForwardSpecialization,
ck::index_t NumDimSpatial,
ck::index_t MPerBlock, // "block" means a data tile sized to fit in cache (L1/L2/L3)
ck::index_t NPerBlock,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch>
// bool IsGemmMPadded,
// bool IsGemmNPadded,
// bool IsGemmKPadded>
struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
{
using DeviceOp = DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
using ADataType = InDataType;
using BDataType = WeiDataType;
using CDataType = OutDataType;
using AElementwiseOperation = InElementwiseOperation;
using BElementwiseOperation = WeiElementwiseOperation;
using CElementwiseOperation = OutElementwiseOperation;
// TODO make A/B datatype different
using ABDataType = InDataType;
static constexpr index_t NDimSpatial = NumDimSpatial;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static auto GetWeightTensorDescriptor(ck::index_t gemm_k, ck::index_t gemm_n)
{
const auto wei_gemm_n_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_n, gemm_k));
const auto wei_gemm_n0_k_n1_grid_desc = transform_tensor_descriptor(
wei_gemm_n_k_grid_desc,
ck::make_tuple(ck::make_unmerge_transform(
ck::make_tuple(wei_gemm_n_k_grid_desc.GetLength(I0) /
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize)),
ck::make_pass_through_transform(wei_gemm_n_k_grid_desc.GetLength(I1))),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0, 2>{}, ck::Sequence<1>{}));
return wei_gemm_n0_k_n1_grid_desc;
}
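// Example (a sketch, assuming MatrixBMinVectorSize is 8, i.e. one AVX2 float vector):
// with GemmN = 64 and GemmK = 288, the [64, 288] weight matrix is viewed as an
// [8, 288, 8] (N0, K, N1) descriptor, so eight consecutive output channels form the
// innermost, vector-friendly dimension consumed by the micro kernel.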
static auto GetOutputTensorDescriptor(ck::index_t gemm_m, ck::index_t gemm_n)
{
const auto out_gemm_m_n_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_n));
return out_gemm_m_n_grid_desc;
}
template <ck::index_t NDim, typename std::enable_if<NDim == 1, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Wi = input_spatial_lengths[0];
const index_t Wo = output_spatial_lengths[0];
const index_t ConvStrideW = conv_filter_strides[0];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
const auto in_n_wo_c_grid_desc = transform_tensor_descriptor(
in_n_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t X = filter_spatial_lengths[0];
const index_t ConvDilationW = conv_filter_dilations[0];
const index_t InLeftPadW = input_left_pads[0];
const index_t InRightPadW = input_right_pads[0];
const auto in_n_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
const auto in_n_wip_c_grid_desc = transform_tensor_descriptor(
in_n_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)),
make_merge_transform(make_tuple(X, C))),
make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
template <ck::index_t NDim, typename std::enable_if<NDim == 2, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Hi = input_spatial_lengths[0];
const index_t Wi = input_spatial_lengths[1];
const index_t Ho = output_spatial_lengths[0];
const index_t Wo = output_spatial_lengths[1];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_ho_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t Y = filter_spatial_lengths[0];
const index_t X = filter_spatial_lengths[1];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
const auto in_n_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_merge_transform(make_tuple(Y, X, C))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Di = input_spatial_lengths[0];
const index_t Hi = input_spatial_lengths[1];
const index_t Wi = input_spatial_lengths[2];
const index_t Do = output_spatial_lengths[0];
const index_t Ho = output_spatial_lengths[1];
const index_t Wo = output_spatial_lengths[2];
const index_t ConvStrideD = conv_filter_strides[0];
const index_t ConvStrideH = conv_filter_strides[1];
const index_t ConvStrideW = conv_filter_strides[2];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_do_ho_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t Z = filter_spatial_lengths[0];
const index_t Y = filter_spatial_lengths[1];
const index_t X = filter_spatial_lengths[2];
const index_t ConvDilationD = conv_filter_dilations[0];
const index_t ConvDilationH = conv_filter_dilations[1];
const index_t ConvDilationW = conv_filter_dilations[2];
const index_t InLeftPadD = input_left_pads[0];
const index_t InLeftPadH = input_left_pads[1];
const index_t InLeftPadW = input_left_pads[2];
const index_t InRightPadD = input_right_pads[0];
const index_t InRightPadH = input_right_pads[1];
const index_t InRightPadW = input_right_pads[2];
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_dip_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_merge_transform(make_tuple(Z, Y, X, C))),
make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
static index_t GetGemmM(ck::index_t N, const std::vector<ck::index_t>& output_spatial_lengths)
{
return N * std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
1,
std::multiplies<ck::index_t>());
}
static index_t GetGemmK(ck::index_t C, const std::vector<ck::index_t>& filter_spatial_lengths)
{
return C * std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
1,
std::multiplies<ck::index_t>());
}
static auto MakeABCGridDescriptor(ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
using namespace ck;
const index_t GemmM = GetGemmM(N, output_spatial_lengths);
const index_t GemmN = K;
const index_t GemmK = GetGemmK(C, filter_spatial_lengths);
// A:
const auto in_gemm_m_k_grid_desc =
GetInputTensorDescriptor<NumDimSpatial>(N,
C,
GemmM,
GemmK,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// B:
const auto wei_gemm_n0_k_n1_grid_desc = GetWeightTensorDescriptor(GemmK, GemmN);
// C:
const auto out_gemm_m_n_grid_desc = GetOutputTensorDescriptor(GemmM, GemmN);
return make_tuple(
in_gemm_m_k_grid_desc, wei_gemm_n0_k_n1_grid_desc, out_gemm_m_n_grid_desc);
}
template <ck::index_t NDim, typename std::enable_if<NDim == 1, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1});
}
template <ck::index_t NDim, typename std::enable_if<NDim == 2, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(
1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1});
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(
1, 1, 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
}
using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());
using AGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
using BGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
using CGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
static constexpr bool UseCLocalBuffer = true;
using GridwiseGemm =
ck::cpu::GridwiseGemmAvx2_MxN<InDataType, // InDataType,
WeiDataType, // WeiDataType,
OutDataType, // OutDataType,
AccDataType, // AccDataType,
AGridDesc, // AGridDesc,
BGridDesc, // BGridDesc,
CGridDesc, // CGridDesc,
AElementwiseOperation, // AElementwiseOperation,
BElementwiseOperation, // BElementwiseOperation,
CElementwiseOperation, // CElementwiseOperation,
MPerBlock, // MPerBlock,
NPerBlock, // NPerBlock,
KPerBlock, // KPerBlock,
ThreadwiseGemm_Dispatch, // ThreadwiseGemm_Dispatch,
ck::Sequence<0, 1, 2>, // BlockMNKAccessOrder,
ck::Sequence<0, 1>, // ThreadMNAccessOrder
UseCLocalBuffer // UseCLocalBuffer
>;
// Argument
struct Argument : public BaseArgument
{
Argument(const InDataType* p_in_grid,
const WeiDataType* p_wei_grid,
OutDataType* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
: p_a_grid_{p_in_grid},
p_b_grid_{p_wei_grid},
p_c_grid_{p_out_grid},
a_grid_desc_{},
b_grid_desc_{},
c_grid_desc_{},
a_element_op_{in_element_op},
b_element_op_{wei_element_op},
c_element_op_{out_element_op},
Conv_N_{N},
Conv_K_{K},
Conv_C_{C},
filter_spatial_lengths_{filter_spatial_lengths},
conv_filter_strides_{conv_filter_strides},
input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads}
{
const auto descs = DeviceOp::MakeABCGridDescriptor(N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
a_grid_desc_ = descs[I0];
b_grid_desc_ = descs[I1];
c_grid_desc_ = descs[I2];
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
AGridDesc a_grid_desc_;
BGridDesc b_grid_desc_;
CGridDesc c_grid_desc_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
// for checking IsSupportedArgument()
index_t Conv_N_;
index_t Conv_K_;
index_t Conv_C_;
std::vector<index_t> filter_spatial_lengths_;
std::vector<index_t> conv_filter_strides_;
std::vector<index_t> input_left_pads_;
std::vector<index_t> input_right_pads_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1)
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_, arg.c_grid_desc_))
{
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
InDataType,
WeiDataType,
OutDataType,
AGridDesc,
BGridDesc,
CGridDesc,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>;
float ave_time = launch_and_time_cpu_kernel(kernel,
nrepeat,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_,
arg.b_grid_desc_,
arg.c_grid_desc_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
// TODO: this is for benchmark purposes; after timing, clear the C buffer and compute
// the result once more so the final output is correct
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize() * sizeof(OutDataType));
launch_cpu_kernel(kernel,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_,
arg.b_grid_desc_,
arg.c_grid_desc_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
// check if it's 1x1, stride=1 conv
if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 &&
arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 &&
arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 &&
arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0))
{
return false;
}
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
// check if it's 1x1 conv
if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 &&
arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 &&
arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0))
{
return false;
}
}
// Gridwise GEMM size
return GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_, arg.c_grid_desc_);
}
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const InDataType* p_in_grid,
const WeiDataType* p_wei_grid,
OutDataType* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
{
return Argument{p_in_grid,
p_wei_grid,
p_out_grid,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in_grid,
const void* p_wei_grid,
void* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) override
{
return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
static_cast<const WeiDataType*>(p_wei_grid),
static_cast<OutDataType*>(p_out_grid),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceConv" << std::to_string(NumDimSpatial)
<< "DFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
<< "<"
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
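// Usage sketch for the device op above (hedged: "Dispatch" is a placeholder for a
// threadwise-GEMM dispatch type, e.g. one built around ThreadwiseGemmAvx2_MxN_6x16,
// whose exact template arguments are not shown here; the 256/128/64 tile sizes and the
// p_in/p_wei/p_out pointers are illustrative only):
//
//   using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
//   namespace dev     = ck::tensor_operation::cpu::device;
//   using ConvOp = dev::DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
//       float, float, float, float,                       // In/Wei/Out/Acc data types
//       PassThrough, PassThrough, PassThrough,            // elementwise operations
//       dev::ConvolutionForwardSpecialization_t::Default,
//       2,                                                // NumDimSpatial
//       256, 128, 64,                                     // MPerBlock, NPerBlock, KPerBlock
//       Dispatch>;
//
//   auto arg     = ConvOp::MakeArgument(p_in, p_wei, p_out, N, K, C,
//                                       in_lengths, filter_lengths, out_lengths,
//                                       strides, dilations, left_pads, right_pads,
//                                       PassThrough{}, PassThrough{}, PassThrough{});
//   auto invoker = ConvOp::MakeInvoker();
//   float avg_ms = invoker.Run(arg, /*nrepeat=*/10);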
#pragma once
#include "data_type_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace element_wise {
using float8_t = ck::cpu::float8_t;
using float4_t = ck::cpu::float4_t;
struct PassThrough
{
void operator()(float& y, const float& x) const { y = x; }
void operator()(float4_t& y, const float4_t& x) const { y = x; }
void operator()(float8_t& y, const float8_t& x) const { y = x; }
};
struct Add
{
void operator()(float& y, const float& x0, const float& x1) const { y = x0 + x1; }
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(x0, x1);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(x0, x1);
}
};
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
void operator()(float& y, const float& x0, const float& x1) const
{
y = alpha_ * x0 + beta_ * x1;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(_mm_mul_ps(x0, _mm_set1_ps(alpha_)), _mm_mul_ps(x1, _mm_set1_ps(beta_)));
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(_mm256_mul_ps(x0, _mm256_set1_ps(alpha_)),
_mm256_mul_ps(x1, _mm256_set1_ps(beta_)));
}
float alpha_;
float beta_;
};
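// Usage sketch: AlphaBetaAdd computes y = alpha * x0 + beta * x1 lane-wise, e.g.
//   AlphaBetaAdd op(1.0f, 0.5f);
//   float8_t y;
//   op(y, c_vec, d_vec);   // each of the 8 lanes: y[i] = 1.0f * c_vec[i] + 0.5f * d_vec[i]
// (c_vec / d_vec are illustrative __m256 values.)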
struct AddRelu
{
void operator()(float& y, const float& x0, const float& x1) const
{
const float a = x0 + x1;
y = a > 0 ? a : 0;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_max_ps(_mm_add_ps(x0, x1), _mm_setzero_ps());
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_max_ps(_mm256_add_ps(x0, x1), _mm256_setzero_ps());
}
};
#if 0
struct AddHardswish
{
void operator()(float& y, const float& x0, const float& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
};
#endif
struct AddReluAdd
{
void operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a > 0 ? a : 0;
float c = b + x2;
y = c;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1, const float4_t& x2) const
{
float4_t a = _mm_add_ps(x0, x1);
float4_t b = _mm_max_ps(a, _mm_setzero_ps());
y = _mm_add_ps(b, x2);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1, const float8_t& x2) const
{
float8_t a = _mm256_add_ps(x0, x1);
float8_t b = _mm256_max_ps(a, _mm256_setzero_ps());
y = _mm256_add_ps(b, x2);
}
};
#if 0
struct AddHardswishAdd
{
void
operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
};
#endif
#if 0
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
void operator()(int8_t& y, const int& x) const
{
float gemm_requant = scaleGemm_ * static_cast<float>(x);
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<int8_t>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
// for reference_gemm
void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<float>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
float scaleGemm_;
float scaleRelu_;
};
#endif
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float4_t, float4_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float4_t, float4_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(x, _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float8_t, float8_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float8_t, float8_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(x, _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
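// Usage sketch (an AVG post-op, assuming 'sum' already holds the sum of n values):
//   UnaryIdentic<float, float, true> avg_op(n);
//   float avg;
//   avg_op(avg, sum);                      // avg = sum / n
//   UnaryIdentic<float8_t, float8_t, true> avg_op8(n);
//   float8_t avg8;
//   avg_op8(avg8, sum8);                   // per-lane division via _mm256_div_ps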
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x * x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float4_t, float4_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = _mm_mul_ps(x, x); };
};
template <>
struct UnarySquare<float4_t, float4_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(_mm_mul_ps(x, x), _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float8_t, float8_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_mul_ps(x, x); };
};
template <>
struct UnarySquare<float8_t, float8_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(_mm256_mul_ps(x, x), _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = std::abs(x); };
};
template <>
struct UnaryAbs<float4_t, float4_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const
{
__m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
y = _mm_and_ps(Mask, x);
};
};
template <>
struct UnaryAbs<float8_t, float8_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const
{
__m256 Mask = _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));
y = _mm256_and_ps(Mask, x);
};
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<float4_t, float4_t>
{
void operator()(float4_t& y, const float4_t& x) const { y = _mm_sqrt_ps(x); };
};
template <>
struct UnarySqrt<float8_t, float8_t>
{
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_sqrt_ps(x); };
};
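// Composition sketch (scalar case): NRM2 over x[0..n) can be built from these unary ops:
//   UnarySquare<float, float> sq;
//   UnarySqrt<float, float> root;
//   float acc = 0.0f, t = 0.0f;
//   for(int i = 0; i < n; ++i) { sq(t, x[i]); acc += t; }
//   root(acc, acc);                        // acc = sqrt(sum of squares)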
} // namespace element_wise
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#ifndef CK_GRIDWISE_GEMM_AVX2_HPP
#define CK_GRIDWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "dynamic_buffer_cpu.hpp"
namespace ck {
namespace cpu {
template <typename GridwiseGemm,
typename FloatA,
typename FloatB,
typename FloatC,
typename AGridDesc,
typename BGridDesc,
typename CGridDesc,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void kernel_gemm_avx_mxn(const FloatA* __restrict__ p_a_grid,
const FloatB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op)
{
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc,
b_grid_desc,
c_grid_desc,
a_element_op,
b_element_op,
c_element_op);
}
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename AGridDesc,
typename BGridDesc,
typename CGridDesc,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
ck::index_t MPerBlock, // "block" means a data tile sized to fit in cache (L1/L2/L3)
ck::index_t NPerBlock,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename BlockMNKAccessOrder, // how we access GEMM M/N/K to better fit in cache
typename ThreadMNAccessOrder, // how we access GEMM M/N to utilize the micro kernel
bool UseCLocalBuffer // if true, allocate a local C buffer, accumulate into it in the
// kernel, then copy it back to the C grid; if false, write to C directly
>
struct GridwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
// static constexpr auto Avx2RegisterVector = 8; // 8 floats
static constexpr index_t MemAlignmentByte = 32; // 256bit
static constexpr auto GetABlockDescriptor()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
constexpr auto a_block_desc_m_k =
make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, KPerBlock));
return a_block_desc_m_k;
}
else
{
// A : K, M
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_packed(
make_tuple(KPerBlock,
math::integer_least_multiple(
MPerBlock, ThreadwiseGemm_Dispatch::MatrixAMinVectorSize)));
return a_block_desc_k_m;
}
}
static constexpr auto GetBBlockDescriptor()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_packed(
make_tuple(KPerBlock,
math::integer_least_multiple(
NPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize)));
return b_block_desc_k_n;
}
else
{
// B : N/8, K, N8
constexpr auto b_block_desc_n0_k_n1 = make_naive_tensor_descriptor_packed(make_tuple(
math::integer_divide_ceil(NPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize),
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize));
return b_block_desc_n0_k_n1;
}
}
static constexpr auto GetABlockSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::Sequence<MPerBlock, KPerBlock>{};
}
else
{
// A : K, M
return ck::Sequence<KPerBlock, MPerBlock>{};
}
}
static constexpr auto GetBBlockSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<KPerBlock, NPerBlock>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<NPerBlock / ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize>{};
}
}
static constexpr auto GetABlockDimAccessOrder() { return ck::Sequence<0, 1>{}; }
static constexpr auto GetBBlockDimAccessOrder()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<0, 1>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<0, 1, 2>{};
}
}
static constexpr auto GetABlockMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::make_multi_index(0, KPerBlock);
}
else
{
// A : K, M
return ck::make_multi_index(KPerBlock, 0);
}
}
static constexpr auto GetBBlockMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::make_multi_index(KPerBlock, 0);
}
else
{
// B : N/8, K, N8
return ck::make_multi_index(0, KPerBlock, 0);
}
}
#if 0
static constexpr auto GetAThreadDescriptor()
{
if constexpr (std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout, ck::tensor_layout::gemm::RowMajor>::value){
// A : M, K
constexpr auto a_thread_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(ThreadwiseGemm_Dispatch::ThreadMaxMr, KPerBlock));
return a_thread_desc_m_k;
} else {
// A : K, M
constexpr auto a_thread_desc_k_m = make_naive_tensor_descriptor_packed(make_tuple(KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxMr));
return a_thread_desc_k_m;
}
}
static constexpr auto GetBThreadDescriptor()
{
if constexpr (std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout, ck::tensor_layout::gemm::RowMajor>::value){
// B : K, N
constexpr auto b_thread_desc_k_n = make_naive_tensor_descriptor_packed(make_tuple(KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxNr));
return b_thread_desc_k_n;
} else {
// B : N/8, K, N8
constexpr auto b_thread_desc_n_k_n8 = make_naive_tensor_descriptor_packed(make_tuple(math::integer_divide_ceil(ThreadwiseGemm_Dispatch::ThreadMaxNr, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize), KPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize));
return b_thread_desc_n_k_n8;
}
}
#endif
static constexpr auto GetAThreadSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::Sequence<ThreadwiseGemm_Dispatch::ThreadMaxMr, KPerBlock>{};
}
else
{
// A : K, M
return ck::Sequence<KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxMr>{};
}
}
static constexpr auto GetBThreadSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxNr>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<ThreadwiseGemm_Dispatch::ThreadMaxNr /
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize>{};
}
}
static constexpr auto GetAThreadMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::make_multi_index(ThreadwiseGemm_Dispatch::ThreadMaxMr, 0);
}
else
{
// A : K, M
return ck::make_multi_index(0, ThreadwiseGemm_Dispatch::ThreadMaxMr);
}
}
static constexpr auto GetBThreadMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::make_multi_index(0, ThreadwiseGemm_Dispatch::ThreadMaxNr);
}
else
{
// B : N/8, K, N8
return ck::make_multi_index(
ThreadwiseGemm_Dispatch::ThreadMaxNr / ThreadwiseGemm_Dispatch::MatrixBMinVectorSize, 0, 0);
}
}
static constexpr ck::index_t GetAThreadLoopOverDim()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return 0;
}
else
{
// A : K, M
return 1;
}
}
static constexpr ck::index_t GetBThreadLoopOverDim()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return 1;
}
else
{
// B : N/8, K, N8
return 0;
}
}
static constexpr auto GetCBlockDescriptor()
{
if constexpr(UseCLocalBuffer)
{
return make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, NPerBlock));
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, NPerBlock)); // TODO:
}
}
static constexpr auto GetCBlockSliceLength() { return ck::Sequence<MPerBlock, NPerBlock>{}; }
static constexpr bool CheckValidity(const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc)
{
#if 0
const auto M = a_grid_desc_k0_m_k1.GetLength(I1);
const auto N = b_grid_desc_k0_n_k1.GetLength(I1);
const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) &&
K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) &&
K1 == b_grid_desc_k0_n_k1.GetLength(I2)))
return false;
if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
return false;
// check NumPrefetch
if constexpr(NumPrefetch == 1)
{
// 1-stage prefetch always supported
}
else if constexpr(NumPrefetch == 2)
{
// 2-stage prefetch currently only support even number of K0 loop
// TODO: add support for odd number of K0 loop
if(!((K0 / K0PerBlock) % 2 == 0))
{
return false;
}
}
else
{
return false;
}
// check M01, N01
constexpr auto M1 = Number<MPerBlock>{};
constexpr auto N1 = Number<NPerBlock>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
if(!(M0 % M01 == 0 && N0 % N01 == 0))
return false;
#endif
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return true;
}
static void Run(const FloatA* __restrict__ p_a_grid,
const FloatB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op)
{
ck::index_t m_per_block = MPerBlock;
ck::index_t n_per_block = NPerBlock;
ck::index_t k_per_block = KPerBlock;
// TODO: when MPerBlock/NPerBlock/KPerBlock are all zero, pick the block sizes at runtime
const auto M = a_grid_desc.GetLength(I0);
const auto N = b_grid_desc.GetLength(I1);
const auto K = b_grid_desc.GetLength(I0);
const ck::index_t grid_m = math::integer_divide_ceil(M, m_per_block);
const ck::index_t grid_n = math::integer_divide_ceil(N, n_per_block);
const ck::index_t grid_size = grid_m * grid_n;
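// Example (a sketch): M = 1000, N = 512 with m_per_block = 256, n_per_block = 128 gives
// grid_m = 4, grid_n = 4 and grid_size = 16; each gid below maps to the tile origin
// (i_mc, i_nc) = ((gid / grid_n) * m_per_block, (gid % grid_n) * n_per_block), and the
// edge tiles are clipped via mc_size / nc_size.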
constexpr auto a_block_desc = GetABlockDescriptor();
constexpr auto a_block_slice_length = GetABlockSliceLength();
constexpr auto a_block_copy_dim = decltype(a_block_slice_length)::Size();
constexpr auto a_dim_access_order = GetABlockDimAccessOrder();
constexpr auto a_block_move_step = GetABlockMoveFwdStep();
constexpr auto a_thread_slice_length = GetAThreadSliceLength();
constexpr auto a_thread_loop_over_dim = GetAThreadLoopOverDim();
constexpr auto b_block_desc = GetBBlockDescriptor();
constexpr auto b_block_slice_length = GetBBlockSliceLength();
constexpr auto b_block_copy_dim = decltype(b_block_slice_length)::Size();
constexpr auto b_dim_access_order = GetBBlockDimAccessOrder();
constexpr auto b_block_move_step = GetBBlockMoveFwdStep();
constexpr auto b_thread_slice_length = GetBThreadSliceLength();
constexpr auto b_thread_loop_over_dim = GetBThreadLoopOverDim();
constexpr auto c_block_desc = GetCBlockDescriptor();
constexpr auto c_block_slice_length = GetCBlockSliceLength();
constexpr auto c_block_move_step = ck::make_multi_index(0, NPerBlock);
auto a_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatA, // SrcData
FloatA, // DstData
decltype(a_grid_desc), // SrcDesc
decltype(a_block_desc), // DstDesc
AElementwiseOperation, // ElementwiseOperation
decltype(a_block_slice_length), // SliceLengths
decltype(a_dim_access_order), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>(a_grid_desc,
ck::make_zero_multi_index<a_block_copy_dim>(),
a_block_desc,
ck::make_zero_multi_index<a_block_copy_dim>(),
AElementwiseOperation{});
auto b_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatB, // SrcData
FloatB, // DstData
decltype(b_grid_desc), // SrcDesc
decltype(b_block_desc), // DstDesc
BElementwiseOperation, // ElementwiseOperation
decltype(b_block_slice_length), // SliceLengths
decltype(b_dim_access_order), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>(b_grid_desc,
ck::make_zero_multi_index<b_block_copy_dim>(),
b_block_desc,
ck::make_zero_multi_index<b_block_copy_dim>(),
BElementwiseOperation{});
auto c_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatC, // SrcData
FloatC, // DstData
decltype(c_block_desc), // SrcDesc
decltype(c_grid_desc), // DstDesc
CElementwiseOperation, // ElementwiseOperation
ck::Sequence<MPerBlock, NPerBlock>, // SliceLengths
ck::Sequence<0, 1>, // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
true, // SrcResetCoordinateAfterRun
false // DstResetCoordinateAfterRun
>(c_block_desc,
ck::make_zero_multi_index<2>(),
c_grid_desc,
ck::make_zero_multi_index<2>(),
CElementwiseOperation{});
DeviceAlignedMemCPU a_block_mem(MPerBlock * KPerBlock * sizeof(FloatA), MemAlignmentByte);
DeviceAlignedMemCPU b_block_mem(KPerBlock * NPerBlock * sizeof(FloatB), MemAlignmentByte);
DeviceAlignedMemCPU c_block_mem(MPerBlock * NPerBlock * sizeof(FloatC), MemAlignmentByte);
auto a_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<const FloatA*>(p_a_grid), a_grid_desc.GetElementSpaceSize());
auto b_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<const FloatB*>(p_b_grid), b_grid_desc.GetElementSpaceSize());
auto c_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatC*>(p_c_grid), c_grid_desc.GetElementSpaceSize());
auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
a_block_mem.mMemSize / sizeof(FloatA));
auto b_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatB*>(b_block_mem.mpDeviceBuf),
b_block_mem.mMemSize / sizeof(FloatB));
auto c_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatC*>(c_block_mem.mpDeviceBuf),
c_block_mem.mMemSize / sizeof(FloatC));
auto blockwise_gemm =
BlockwiseGemmAvx2_MxN<FloatA, // FloatA,
FloatB, // FloatB,
FloatC, // FloatC,
AccDataType, // AccDataType,
decltype(a_block_desc), // ABlockDesc,
decltype(b_block_desc), // BBlockDesc,
decltype(c_block_desc), // CBlockDesc,
decltype(a_block_slice_length), // ABlockSliceLengths,
decltype(b_block_slice_length), // BBlockSliceLengths,
decltype(c_block_slice_length), // CBlockSliceLengths,
decltype(a_thread_slice_length), // AThreadSliceLength,
decltype(b_thread_slice_length), // BThreadSliceLength,
a_thread_loop_over_dim, // AThreadLoopOverDim: dim of the thread slice to loop
// over within the block slice; 1D is enough for now
b_thread_loop_over_dim, // BThreadLoopOverDim,
KPerBlock, // KPerBlock,
ThreadwiseGemm_Dispatch, // ThreadwiseGemm_Dispatch,
ThreadMNAccessOrder>{}; // ThreadMNAccessOrder: how we access
// GEMM M/N to utilize the micro kernel
// TODO: openmp aware ordering
if constexpr(std::is_same<BlockMNKAccessOrder, ck::Sequence<0, 1, 2>>::value)
{
#pragma omp parallel for
for(ck::index_t gid = 0; gid < grid_size; gid++)
{
ck::index_t i_mc = (gid / grid_n) * m_per_block;
ck::index_t i_nc = (gid % grid_n) * n_per_block;
ck::index_t mc_size = ck::math::min(M - i_mc, m_per_block);
ck::index_t nc_size = ck::math::min(N - i_nc, n_per_block);
// pack_b
b_threadwise_copy.RunGeneric(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);
b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_move_step);
if(i_nc == 0)
{
// pack_a
a_threadwise_copy.RunGeneric(
a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
a_threadwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_move_step);
}
for(ck::index_t i_kc = 0; i_kc < K; i_kc += k_per_block)
{
ck::index_t kc_size = ck::math::min(K - i_kc, k_per_block);
blockwise_gemm.Run(a_block_desc,
a_block_buf,
make_zero_multi_index<a_block_copy_dim>(),
b_block_desc,
b_block_buf,
make_zero_multi_index<b_block_copy_dim>(),
c_block_desc,
c_block_buf,
make_zero_multi_index<2>());
}
if constexpr(UseCLocalBuffer)
{
c_threadwise_copy.RunGeneric(
c_block_desc, c_block_buf, c_grid_desc, c_grid_buf);
c_threadwise_copy.MoveDstSliceWindow(c_grid_desc, c_block_move_step);
}
}
}
}
};
} // namespace cpu
} // namespace ck
#endif
...@@ -5,13 +5,23 @@
#include <immintrin.h>
#endif
#include "common_header.hpp"
#include "../../gpu/device/tensor_layout.hpp"
#include "math.hpp"
#include "threadwise_param.hpp"
namespace ck {
namespace cpu {
#if 0
struct ThreadWiseGemmAvx2_Base {
virtual void Run(ThreadwiseGemmParam* param) = 0;
virtual index_t GetMr() const = 0;
virtual index_t GetMaxMr() const = 0;
virtual index_t GetNr() const = 0;
virtual index_t GetMaxNr() const = 0;
};
#endif
template <typename FloatA,
          typename FloatB,
          typename FloatC,
...@@ -22,17 +32,17 @@ template <typename FloatA,
          bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_6x16
{
    using MatrixALayout = ALayout;
    using MatrixBLayout = BLayout;
    static constexpr auto ThreadMaxMr = 6;
    static constexpr auto ThreadMaxNr = 16;
    static constexpr auto ThreadMr = Mr;
    static constexpr auto ThreadNr = Nr;
    static constexpr auto ThreadNTStore = NonTemporalStore;

    static_assert(Mr <= 6 && Mr >= 1 && (Nr == 8 || Nr == 16), "wrong! Mr x Nr not valid");

    static void Run(ThreadwiseGemmParam* param)
    {
        /* 6x16 ukernel
         *
...@@ -563,18 +573,18 @@ template <typename FloatA,
          bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_4x24
{
    using MatrixALayout = ALayout;
    using MatrixBLayout = BLayout;
    static constexpr auto ThreadMaxMr = 4;
    static constexpr auto ThreadMaxNr = 24;
    static constexpr auto ThreadMr = Mr;
    static constexpr auto ThreadNr = Nr;
    static constexpr auto ThreadNTStore = NonTemporalStore;

    static_assert(Mr <= 4 && Mr >= 1 && (Nr == 8 || Nr == 16 || Nr == 24),
                  "wrong! Mr x Nr not valid");

    static void Run(ThreadwiseGemmParam* param)
    {
        /* 4x24 ukernel
         *
...@@ -820,18 +830,18 @@ struct ThreadwiseGemmAvx2_MxN_4x24
        ".if (m_Mr > 2)\n lea (%%rbx, %%rdi, 1), %%rcx\n .endif\n"
        ".if (m_Mr > 3)\n lea (%%rcx, %%rdi, 1), %%rdx\n .endif\n"
        // " vaddps (%%rax), %%ymm0, %%ymm0 \n"
        // ".if (m_Nr > 8)\n vaddps 32(%%rax), %%ymm1, %%ymm1 \n .endif\n"
        // ".if (m_Nr >16)\n vaddps 64(%%rax), %%ymm2, %%ymm2 \n .endif\n"
        // ".if (m_Mr > 1) \n vaddps (%%rbx), %%ymm3, %%ymm3 \n .endif\n"
        // ".if (m_Mr > 1) && (m_Nr > 8)\n vaddps 32(%%rbx), %%ymm4, %%ymm4 \n .endif\n"
        // ".if (m_Mr > 1) && (m_Nr >16)\n vaddps 64(%%rbx), %%ymm5, %%ymm5 \n .endif\n"
        // ".if (m_Mr > 2) \n vaddps (%%rcx), %%ymm6, %%ymm6 \n .endif\n"
        // ".if (m_Mr > 2) && (m_Nr > 8)\n vaddps 32(%%rcx), %%ymm7, %%ymm7 \n .endif\n"
        // ".if (m_Mr > 2) && (m_Nr >16)\n vaddps 64(%%rcx), %%ymm8, %%ymm8 \n .endif\n"
        // ".if (m_Mr > 3) \n vaddps (%%rdx), %%ymm9, %%ymm9 \n .endif\n"
        // ".if (m_Mr > 3) && (m_Nr > 8)\n vaddps 32(%%rdx), %%ymm10, %%ymm10\n .endif\n"
        // ".if (m_Mr > 3) && (m_Nr >16)\n vaddps 64(%%rdx), %%ymm11, %%ymm11\n .endif\n"
        ".if m_NTStore == 0\n"
        " vmovups %%ymm0, (%%rax) \n"
...@@ -1090,6 +1100,304 @@ struct ThreadwiseGemmAvx2_MxN_4x24
    }
};
typedef void (*pThreadwiseGemmAvx2Run)(ThreadwiseGemmParam*);
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ALayout, // default is k*m, trans->m*k
typename BLayout, // default is n/8*k*n8, trans->k*n
bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_6x16_Dispatch
{
using MatrixALayout = ALayout;
using MatrixBLayout = BLayout;
static constexpr auto ThreadMaxMr = 6;
static constexpr auto ThreadMaxNr = 16;
// static constexpr auto ThreadMr = Mr;
// static constexpr auto ThreadNr = Nr;
static constexpr auto ThreadNTStore = NonTemporalStore;
static constexpr auto MatrixAMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? 1 : 8;
static constexpr auto MatrixBMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? 8 : 8;
using ThreadwiseGemm_6x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
6,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_6x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
6,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_5x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
5,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_5x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
5,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
4,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
4,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
3,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
3,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
2,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
2,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
1,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
1,
8,
ALayout,
BLayout,
NonTemporalStore>;
static constexpr pThreadwiseGemmAvx2Run dispatch_table[6][2] = {
{
ThreadwiseGemm_6x16_t::Run,
ThreadwiseGemm_6x8_t::Run,
},
{
ThreadwiseGemm_5x16_t::Run,
ThreadwiseGemm_5x8_t::Run,
},
{
ThreadwiseGemm_4x16_t::Run,
ThreadwiseGemm_4x8_t::Run,
},
{
ThreadwiseGemm_3x16_t::Run,
ThreadwiseGemm_3x8_t::Run,
},
{
ThreadwiseGemm_2x16_t::Run,
ThreadwiseGemm_2x8_t::Run,
},
{
ThreadwiseGemm_1x16_t::Run,
ThreadwiseGemm_1x8_t::Run,
},
};
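    // Note: rows of dispatch_table are ordered Mr = 6,5,...,1 and columns Nr = 16,8, so Run()
    // is presumably called with table indices (row 0 = largest Mr, column 0 = largest Nr) rather
    // than the raw mr/nr tile sizes, which would read past the table.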
static void Run(ThreadwiseGemmParam* param, index_t mr, index_t nr)
{
return dispatch_table[mr][nr](param);
}
};
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ALayout, // default is k*m, trans->m*k
typename BLayout, // default is n/8*k*n8, trans->k*n
bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_4x24_Dispatch
{
using MatrixALayout = ALayout;
using MatrixBLayout = BLayout;
static constexpr auto ThreadMaxMr = 4;
static constexpr auto ThreadMaxNr = 24;
// static constexpr auto ThreadMr = Mr;
// static constexpr auto ThreadNr = Nr;
static constexpr auto ThreadNTStore = NonTemporalStore;
static constexpr auto MatrixAMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? 1 : 8;
static constexpr auto MatrixBMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? 8 : 8;
using ThreadwiseGemm_4x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
8,
ALayout,
BLayout,
NonTemporalStore>;
static constexpr pThreadwiseGemmAvx2Run dispatch_table[4][3] = {
{
ThreadwiseGemm_4x24_t::Run,
ThreadwiseGemm_4x16_t::Run,
ThreadwiseGemm_4x8_t::Run,
},
{
ThreadwiseGemm_3x24_t::Run,
ThreadwiseGemm_3x16_t::Run,
ThreadwiseGemm_3x8_t::Run,
},
{
ThreadwiseGemm_2x24_t::Run,
ThreadwiseGemm_2x16_t::Run,
ThreadwiseGemm_2x8_t::Run,
},
{
ThreadwiseGemm_1x24_t::Run,
ThreadwiseGemm_1x16_t::Run,
ThreadwiseGemm_1x8_t::Run,
},
};
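    // Note: same convention as the 6x16 dispatch above; rows are Mr = 4..1 and columns are
    // Nr = 24, 16, 8, so mr/nr are presumably table indices rather than raw tile sizes.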
static void Run(ThreadwiseGemmParam* param, index_t mr, index_t nr)
{
return dispatch_table[mr][nr](param);
}
};
} // namespace cpu
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include <immintrin.h>
namespace ck {
namespace cpu {
// Assume:
//   1. src_desc and dst_desc are not known at compile-time
//   2. src_slice_origin and dst_slice_origin are not known at compile-time
//   3. a __m256 register always holds 8 contiguous dwords, so if the fast-changing dim is a
//      complex (merged) dimension, reconsider the layout (e.g. NCHW is not a good fit for
//      non-1x1 filters)
//   4. RunGeneric() can handle any case (by not using ymm), but performance is not guaranteed
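// A minimal usage sketch (hypothetical 2d src_desc/dst_desc, buffers and a PassThrough op, for
// illustration only; it mirrors how the blockwise gemm above builds its a/b/c copies):
//
//   auto copy = ThreadwiseTensorSliceTransferAvx2<
//       float, float,                             // SrcData, DstData
//       decltype(src_desc), decltype(dst_desc),   // SrcDesc, DstDesc
//       PassThrough,                              // ElementwiseOperation
//       ck::Sequence<MPerBlock, KPerBlock>,       // SliceLengths
//       ck::Sequence<0, 1>,                       // DimAccessOrder
//       1, 1,                                     // VectorDim, ScalarPerVector
//       InMemoryDataOperationEnum_t::Set,
//       false, true>(                             // Src/DstResetCoordinateAfterRun
//       src_desc, make_zero_multi_index<2>(), dst_desc, make_zero_multi_index<2>(), PassThrough{});
//
//   copy.RunGeneric(src_desc, src_buf, dst_desc, dst_buf);             // copy one tile
//   copy.MoveSrcSliceWindow(src_desc, make_multi_index(0, KPerBlock)); // slide to the next tile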
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
          index_t ScalarPerVector, // src and dst must use the same vector size, i.e. both use
                                   // the same avx/float register width
InMemoryDataOperationEnum_t DstInMemOp,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransferAvx2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
constexpr ThreadwiseTensorSliceTransferAvx2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
        // On GPU this function is used to set the per-thread index based on threadIdx.x.
        // On CPU there is no need to call it.
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename DstBuffer>
void RunGeneric(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
ck::detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
// loop over space-filling curve
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
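        // each access moves one ScalarPerVector-wide vector, so num_access iterations cover the
        // whole SliceLengths tile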
// std::cout<<"num_access:"<<num_access<<std::endl;
static_for<0, num_access, 1>{}([&](auto idx_1d) {
using src_vector_type = ck::cpu::vector_type_maker_t<SrcData, ScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
using dst_vector_type = ck::cpu::vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// printf("[%s] ", is_src_valid ? "y":"n");
// print_multi_index(src_coord_.GetIndex());
// printf("----");
// print_multi_index(src_coord_.GetHiddenIndex());
// printf(":%d", src_coord_.GetOffset());
// printf("\n");
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
// static_for<0, ScalarPerVector, 1>{}([&](auto i) {
// element_op_(dst_vector_container.template AsType<DstData>()(i),
// src_vector_container.template AsType<SrcData>()[i]);
// });
element_op_(dst_vector_container.template AsType<dst_vector_t>(),
src_vector_container.template AsType<src_vector_t>());
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// printf(" -> ");
// print_multi_index(dst_coord_.GetIndex());
// printf(":%d", dst_coord_.GetOffset());
// printf(", src:0x%x, dst:0x%x",
// *reinterpret_cast<uint32_t*>(&src_vector_container.template AsType<src_vector_t>()),
// *reinterpret_cast<uint32_t*>(&dst_vector_container.template
// AsType<dst_vector_t>()));
// printf("\n");
// copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>());
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
if constexpr(num_access == 0)
{
return typename SpaceFillingCurve::Index{};
}
else
{
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
return reset_step;
}
}
    // src_slice_origin_step_idx needs to be known at compile time, for performance reasons
void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
{
        // if the src coord was not reset by RunGeneric(), adjust the step here
        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                           ? src_slice_origin_step_idx
                                           : src_slice_origin_step_idx + GetCoordinateResetStep();
        // printf(" GetCoordinateResetStep:");
        // print_multi_index(GetCoordinateResetStep());
        // printf(" adjusted_step_idx:");
        // print_multi_index(adjusted_step_idx);
        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
        // printf(" adjusted_step:");
        // print_multi_index(adjusted_step.GetIndexDiff());
        // printf("\n");
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
    // dst_slice_origin_step_idx needs to be known at compile time, for performance reasons
void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& dst_slice_origin_step_idx)
{
        // if the dst coord was not reset by RunGeneric(), adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace cpu
} // namespace ck
#endif
...@@ -34,7 +34,7 @@
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "debug.hpp"
// TODO: remove this
...
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
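// e.g. vector_type_maker_t<float, 8> is vector_type<float, 8>, whose ::type is __m256 (see the
// specialization below); a nested request such as vector_type_maker<vector_type<float, 4>, 2>
// is rejected at instantiation time by the declared-but-undefined specializations above.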
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// scalar (no SIMD register)
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// AVX
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
    v = static_cast<Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
#ifndef CK_BUFFER_CPU_HPP
#define CK_BUFFER_CPU_HPP
#include "config.hpp"
#include "enable_if.hpp"
#include "data_type_cpu.hpp"
namespace ck {
namespace cpu {
template <AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
struct DynamicBuffer
{
using type = T;
static_assert(BufferAddressSpace ==
AddressSpaceEnum_t::Global); // only valid for global address space on cpu
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size, T invalid_element_value)
: p_data_{p_data},
element_space_size_{element_space_size},
invalid_element_value_{invalid_element_value}
{
}
static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; }
constexpr const T& operator[](index_t i) const { return p_data_[i]; }
constexpr T& operator()(index_t i) { return p_data_[i]; }
// X should be data_type::type, not directly data_type
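    // e.g. for T = float, pass X = float8_t (__m256) or float4_t (__m128), not
    // vector_type<float, 8>: load_vector/store_vector below operate on the raw register type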
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
constexpr auto Get(index_t i, bool is_valid_element) const
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
clear_vector(v);
return v;
}
else
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
set_vector(v, invalid_element_value_);
return v;
}
}
template <InMemoryDataOperationEnum_t Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
if(is_valid_element)
{
store_vector(x, &p_data_[i]);
}
}
static constexpr bool IsStaticBuffer() { return false; }
static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
p, element_space_size, invalid_element_value};
}
} // namespace cpu
} // namespace ck
#endif
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/tensor_operation_instance/cpu)
\ No newline at end of file
...@@ -29,6 +29,8 @@ struct DeviceAlignedMemCPU
    DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
    void* GetDeviceBuffer();
    std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
    void SetZero();
    ~DeviceAlignedMemCPU();
...@@ -108,4 +110,27 @@ float launch_and_time_kernel(
    return timer.GetElapsedTime() / nrepeat;
}
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
{
kernel(args...);
}
template <typename... Args, typename F>
float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
{
WallTimer timer;
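    // untimed warm-up call, so first-touch/cache effects do not skew the averaged time below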
kernel(args...);
timer.Start();
for(int i = 0; i < nrepeat; i++)
{
kernel(args...);
}
timer.End();
return timer.GetElapsedTime() / nrepeat;
}
#endif
...@@ -45,6 +45,10 @@ void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
...
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/external/include/half
)
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} SHARED ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
add_subdirectory(conv2d_fwd)
# device_conv2d_fwd_cpu_instance
set(DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE
device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
)
add_library(device_conv2d_fwd_cpu_instance SHARED ${DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE})
target_compile_features(device_conv2d_fwd_cpu_instance PUBLIC)
set_target_properties(device_conv2d_fwd_cpu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_fwd_cpu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_cpu_instance)
#include <stdlib.h>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using InType = float;
using WeiType = float;
using OutType = float;
using AccType = float;
using InLayout = ck::tensor_layout::gemm::RowMajor; // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXC
static constexpr bool NonTemporalStore = false;
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using ThreadwiseGemmAvx2_MxN_4x24_Dispatch =
ck::cpu::ThreadwiseGemmAvx2_MxN_4x24_Dispatch<InType,
WeiType,
OutType,
InLayout,
WeiLayout,
NonTemporalStore>;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;
static constexpr auto ConvFwd1x1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
static constexpr auto ConvFwd1x1S1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
//#################################################################|InDataType|WeiDataType|OutDataType|AccDataType|InElementwiseOp|WeiElementwiseOp|OutElementwiseOp|ConvForwardSp|NumDimSpatial|MPerBlock|NPerBlock|KPerBlock|ThreadwiseGemm_Dispatch
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
256,
128,
64,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
512,
256,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
1024,
144,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>>;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
...@@ -32,6 +32,7 @@ set(PROFILER_SOURCE
    src/profile_conv_fwd_bias_relu.cpp
    src/profile_conv_fwd_bias_relu_add.cpp
    src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_fwd_cpu.cpp
    src/profile_convnd_bwd_data.cpp
    src/profile_reduce.cpp
    src/profile_grouped_gemm.cpp
...@@ -51,6 +52,7 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_cpu_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
#define AVX2_DATA_ALIGNMENT 32 // assumed: 32-byte alignment to match AVX2 __m256 loads/stores
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_cpu_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
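    // Note: the tensor is always described with dims ordered (N, C, H, W); only the strides
    // change per layout, e.g. NHWC gets strides {C*H*W, 1, W*C, C}.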
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_gflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
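            // ave_time is in ms, so flop / (1e6 * ave_time) = flop / (1e9 * seconds) = GFLOPS,
            // and likewise bytes / (1e6 * ave_time) gives GB/s below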
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float gflops = static_cast<float>(flop) / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(gflops > best_gflops)
{
best_conv_name = conv_name;
best_gflops = gflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
memcpy(out_n_k_ho_wo_device_result.mData.data(),
out_device_buf.mpDeviceBuf,
out_device_buf.mMemSize);
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck