Commit c0f698d5 authored by carlushuang

add test for threadwise transfer. Currently static_ford in threadwise transfer cannot support large MC*KC tile sizes.
parent e6ee6594
#ifndef CK_BLOCKWISE_GEMM_AVX2_HPP
#define CK_BLOCKWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "threadwise_gemm_avx2.hpp"
namespace ck {
namespace cpu {
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename ABlockDesc,
typename BBlockDesc,
typename CBlockDesc,
typename ABlockSliceLengths,
typename BBlockSliceLengths,
typename CBlockSliceLengths,
typename AThreadSliceLength,
typename BThreadSliceLength,
ck::index_t AThreadLoopOverDim, // thread slice loops over the block slice; 1D is enough for now
ck::index_t BThreadLoopOverDim,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename ThreadMNAccessOrder // how we access gemm MN to utilize the micro kernel
>
struct BlockwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
static constexpr index_t nDimA = ABlockDesc::GetNumOfDimension();
static constexpr index_t nDimB = BBlockDesc::GetNumOfDimension();
static constexpr index_t nDimC = CBlockDesc::GetNumOfDimension();
using IndexA = MultiIndex<nDimA>;
using IndexB = MultiIndex<nDimB>;
using IndexC = MultiIndex<nDimC>;
using ACoord = decltype(make_tensor_coordinate(ABlockDesc{}, IndexA{}));
using BCoord = decltype(make_tensor_coordinate(BBlockDesc{}, IndexB{}));
using CCoord = decltype(make_tensor_coordinate(CBlockDesc{}, IndexC{}));
#if 0
constexpr BlockwiseGemmAvx2_MxN(const ABlockDesc & a_block_desc, const IndexA& a_thread_origin,
const BBlockDesc & b_block_desc, const IndexB& b_thread_origin)
: a_thread_coord_(make_tensor_coordinate(a_block_desc, a_thread_origin)),
b_thread_coord_(make_tensor_coordinate(b_block_desc, b_thread_origin)),
{
}
#endif
template <typename TensorDesc>
static constexpr auto GetLeadingElement(const TensorDesc& desc)
{
// if using this function, make sure desc is known at compile time;
// otherwise it is not efficient to calculate the leading dimension here
if constexpr(TensorDesc::GetNumOfDimension() == 1)
{
return 1;
}
else
{
constexpr auto last_dims =
typename uniform_sequence_gen<TensorDesc::GetNumOfDimension() - 1, 0>::type{};
constexpr auto lead_dims = decltype(last_dims)::PushFront(Number<1>{});
return desc.CalculateOffset(lead_dims);
}
}
template <typename ABlockBuffer, typename BBlockBuffer, typename CBlockBuffer>
void Run(const ABlockDesc& a_block_desc,
const ABlockBuffer& a_block_buf,
const IndexA& a_origin,
const BBlockDesc& b_block_desc,
const BBlockBuffer& b_block_buf,
const IndexB& b_origin,
const CBlockDesc& c_block_desc,
CBlockBuffer& c_block_buf,
const IndexC& c_origin) const
{
constexpr auto m_n_block_length =
ck::Sequence<ABlockSliceLengths::At(AThreadLoopOverDim),
BBlockSliceLengths::At(BThreadLoopOverDim)>{};
constexpr auto m_n_thread_length =
ck::Sequence<AThreadSliceLength::At(AThreadLoopOverDim),
BThreadSliceLength::At(BThreadLoopOverDim)>{};
constexpr auto m_n_access_length = m_n_block_length / m_n_thread_length;
constexpr auto ordered_m_n_access_length =
container_reorder_given_new2old(m_n_access_length, ThreadMNAccessOrder{});
constexpr auto a_block_idx_zeros =
typename uniform_sequence_gen<nDimA, 0>::type{}; // starting point of the block
constexpr auto b_block_idx_zeros = typename uniform_sequence_gen<nDimB, 0>::type{};
constexpr auto lda = GetLeadingElement(a_block_desc) * sizeof(FloatA);
constexpr auto ldb = GetLeadingElement(b_block_desc) * sizeof(FloatB);
constexpr auto ldc = GetLeadingElement(c_block_desc) * sizeof(FloatC);
ck::cpu::ThreadwiseGemmParam param;
param.Kr = KPerBlock;
param.lda = lda;
param.ldb = ldb;
param.ldc = ldc;
param.alpha = 1.0f; // TODO
static_ford<decltype(ordered_m_n_access_length)>{}([&](auto ordered_idx) {
constexpr auto origin_m_n_idx = ordered_idx.ReorderGivenOld2New(ThreadMNAccessOrder{});
constexpr auto current_m_idx =
origin_m_n_idx.At(0) * AThreadSliceLength::At(AThreadLoopOverDim);
constexpr auto current_n_idx =
origin_m_n_idx.At(1) * BThreadSliceLength::At(BThreadLoopOverDim);
constexpr auto current_mr =
ck::math::min(m_n_block_length.At(0) - current_m_idx, m_n_thread_length.At(0));
constexpr auto current_nr =
ck::math::min(m_n_block_length.At(1) - current_n_idx, m_n_thread_length.At(1));
constexpr auto a_block_idx =
a_block_idx_zeros.Modify(AThreadLoopOverDim, current_m_idx);
const auto a_block_coord =
make_tensor_coordinate(a_block_desc, to_multi_index(a_origin + a_block_idx));
constexpr auto b_block_idx =
b_block_idx_zeros.Modify(BThreadLoopOverDim, current_n_idx);
const auto b_block_coord =
make_tensor_coordinate(b_block_desc, to_multi_index(b_origin + b_block_idx));
const auto c_block_coord =
make_tensor_coordinate(c_block_desc, to_multi_index(c_origin + origin_m_n_idx));
param.p_a = &a_block_buf.p_data_[a_block_coord.GetOffset()];
param.p_b = &b_block_buf.p_data_[b_block_coord.GetOffset()];
param.p_c = &c_block_buf.p_data_[c_block_coord.GetOffset()];
ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
});
}
};
} // namespace cpu
} // namespace ck
#endif
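For reference, Run() above walks the M x N block tile in ThreadMNAccessOrder and hands each micro-tile to ThreadwiseGemm_Dispatch, clamping the tail tiles with min(). A standalone sketch of that tiling logic in plain C++ (sizes here are hypothetical; this is not the CK dispatch itself):

// Illustration only: split an M x N block tile into mr x nr micro-tiles, clamping
// the tail tiles with min(), as the static_ford loop in Run() does.
#include <algorithm>
#include <cstdio>
int main()
{
    const int M = 256, N = 128; // hypothetical block slice lengths
    const int mr = 4, nr = 24;  // hypothetical micro-kernel tile (cf. the 4x24 dispatch later in this commit)
    for(int m = 0; m < M; m += mr)
    {
        for(int n = 0; n < N; n += nr)
        {
            const int cur_mr = std::min(M - m, mr);
            const int cur_nr = std::min(N - n, nr);
            // ThreadwiseGemm_Dispatch::Run(&param, cur_mr, cur_nr) would be invoked here
            std::printf("micro-tile at (%d,%d), size %dx%d\n", m, n, cur_mr, cur_nr);
        }
    }
    return 0;
}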
#ifndef CONVOLUTION_FORWARD_SPECIALIZATION_CPU
#define CONVOLUTION_FORWARD_SPECIALIZATION_CPU
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
enum ConvolutionForwardSpecialization_t
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_BASE_CPU_HPP
#define DEVICE_BASE_CPU_HPP
#include <string>
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
struct BaseArgument
{
BaseArgument() = default;
BaseArgument(const BaseArgument&) = default;
BaseArgument& operator=(const BaseArgument&) = default;
virtual ~BaseArgument() {}
};
struct BaseInvoker
{
BaseInvoker() = default;
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, int = 1) = 0;
virtual ~BaseInvoker() {}
};
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
virtual bool IsSupportedArgument(const BaseArgument*) = 0;
virtual std::string GetTypeString() const = 0;
virtual ~BaseOperator() {}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
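A minimal sketch of how a concrete CPU operator plugs into these interfaces; the names below (MyArgument, MyInvoker) are hypothetical and only illustrate the ownership and call pattern, assuming device_base_cpu.hpp is included:

// Hypothetical skeleton: a trivial copy "operator" expressed through the base classes.
struct MyArgument final : public ck::tensor_operation::cpu::device::BaseArgument
{
    const float* p_in  = nullptr;
    float* p_out       = nullptr;
    ck::index_t length = 0;
};
struct MyInvoker final : public ck::tensor_operation::cpu::device::BaseInvoker
{
    float Run(const ck::tensor_operation::cpu::device::BaseArgument* p_arg, int = 1) override
    {
        const auto* arg = dynamic_cast<const MyArgument*>(p_arg);
        for(ck::index_t i = 0; i < arg->length; ++i)
            arg->p_out[i] = arg->p_in[i];
        return 0.0f; // a real invoker would return the measured time in ms
    }
};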
#ifndef DEVICE_CONV_FWD_CPU_HPP
#define DEVICE_CONV_FWD_CPU_HPP
#include <iostream>
#include "device_base_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
struct DeviceConvFwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in,
const void* p_wei,
void* p_out,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
using DeviceConvFwdPtr = std::unique_ptr<
DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
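A sketch of the call sequence this interface expects (the CPU profiler added later in this commit follows the same pattern); conv_ptr is assumed to come from an instance factory such as the AVX2 one below, and the problem sizes are hypothetical:

// Hedged sketch: drive one DeviceConvFwd instance end to end.
template <typename ConvPtr, typename ElementOp>
float run_one_instance(ConvPtr& conv_ptr, const void* p_in, const void* p_wei, void* p_out,
                       const ElementOp& op)
{
    auto argument_ptr = conv_ptr->MakeArgumentPointer(p_in, p_wei, p_out,
                                                      /*N=*/1, /*K=*/64, /*C=*/32,
                                                      {28, 28}, {3, 3}, {28, 28},     // in/filter/out spatial
                                                      {1, 1}, {1, 1}, {1, 1}, {1, 1}, // strides/dilations/pads
                                                      op, op, op);
    if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
        return -1.0f; // this instance cannot handle the problem
    auto invoker_ptr = conv_ptr->MakeInvokerPointer();
    return invoker_ptr->Run(argument_ptr.get(), /*nrepeat=*/10); // average time in ms
}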
#pragma once
#include "data_type_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace element_wise {
using float8_t = ck::cpu::float8_t;
using float4_t = ck::cpu::float4_t;
struct PassThrough
{
void operator()(float& y, const float& x) const { y = x; }
void operator()(float4_t& y, const float4_t& x) const { y = x; }
void operator()(float8_t& y, const float8_t& x) const { y = x; }
};
struct Add
{
void operator()(float& y, const float& x0, const float& x1) const { y = x0 + x1; }
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(x0, x1);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(x0, x1);
}
};
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
void operator()(float& y, const float& x0, const float& x1) const
{
y = alpha_ * x0 + beta_ * x1;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(_mm_mul_ps(x0, _mm_set1_ps(alpha_)), _mm_mul_ps(x1, _mm_set1_ps(beta_)));
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(_mm256_mul_ps(x0, _mm256_set1_ps(alpha_)),
_mm256_mul_ps(x1, _mm256_set1_ps(beta_)));
}
float alpha_;
float beta_;
};
struct AddRelu
{
void operator()(float& y, const float& x0, const float& x1) const
{
const float a = x0 + x1;
y = a > 0 ? a : 0;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_max_ps(_mm_add_ps(x0, x1), _mm_setzero_ps());
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_max_ps(_mm256_add_ps(x0, x1), _mm256_setzero_ps());
}
};
#if 0
struct AddHardswish
{
void operator()(float& y, const float& x0, const float& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
};
#endif
struct AddReluAdd
{
void operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a > 0 ? a : 0;
float c = b + x2;
y = c;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1, const float4_t& x2) const
{
float4_t a = _mm_add_ps(x0, x1);
float4_t b = _mm_max_ps(a, _mm_setzero_ps());
y = _mm_add_ps(b, x2);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1, const float8_t& x2) const
{
float8_t a = _mm256_add_ps(x0, x1);
float8_t b = _mm256_max_ps(a, _mm256_setzero_ps());
y = _mm256_add_ps(b, x2);
}
};
#if 0
struct AddHardswishAdd
{
void
operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
};
#endif
#if 0
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
void operator()(int8_t& y, const int& x) const
{
float gemm_requant = scaleGemm_ * static_cast<float>(x);
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<int8_t>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
// for reference_gemm
void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<float>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
float scaleGemm_;
float scaleRelu_;
};
#endif
// Unary operators are usually applied element-wise before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types such as AVG, NRM1, NRM2
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float4_t, float4_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float4_t, float4_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(x, _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float8_t, float8_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float8_t, float8_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(x, _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x * x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float4_t, float4_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = _mm_mul_ps(x, x); };
};
template <>
struct UnarySquare<float4_t, float4_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(_mm_mul_ps(x, x), _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float8_t, float8_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_mul_ps(x, x); };
};
template <>
struct UnarySquare<float8_t, float8_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(_mm256_mul_ps(x, x), _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<float4_t, float4_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const
{
__m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
y = _mm_and_ps(Mask, x);
};
};
template <>
struct UnaryAbs<float8_t, float8_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const
{
__m256 Mask = _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));
y = _mm256_and_ps(Mask, x);
};
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<float4_t, float4_t>
{
void operator()(float4_t& y, const float4_t& x) const { y = _mm_sqrt_ps(x); };
};
template <>
struct UnarySqrt<float8_t, float8_t>
{
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_sqrt_ps(x); };
};
} // namespace element_wise
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
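As a usage note, these functors are meant to be applied per element or per AVX register. A minimal sketch (assuming this header and <immintrin.h> are included) applying AddRelu over two float arrays, 8 lanes per step with a scalar tail:

#include <immintrin.h>
inline void add_relu_f32(float* y, const float* x0, const float* x1, int n)
{
    ck::tensor_operation::cpu::element_wise::AddRelu op;
    int i = 0;
    for(; i + 8 <= n; i += 8) // vector body uses the float8_t overload
    {
        ck::cpu::float8_t r;
        op(r, _mm256_loadu_ps(x0 + i), _mm256_loadu_ps(x1 + i));
        _mm256_storeu_ps(y + i, r);
    }
    for(; i < n; ++i) // scalar tail uses the float overload
        op(y[i], x0[i], x1[i]);
}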
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include <immintrin.h>
namespace ck {
namespace cpu {
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. src_slice_origin and dst_slice_origin are not known at compile-time
// 3. always use a __m256 register to hold 8 contiguous dwords, so if the fast-changing
// dim is a complex dimension, better reconsider the layout (e.g. NCHW is not good if non-1x1)
// 4. RunGeneric() can handle any case (by not using ymm), but performance is not guaranteed
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector, // src/dst must use the same vector size, i.e. src/dst both need the
// same AVX/float register
InMemoryDataOperationEnum_t DstInMemOp,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransferAvx2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
constexpr ThreadwiseTensorSliceTransferAvx2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
// On GPU this function is used to set the per-thread index based on threadIdx.x.
// On CPU there is no need to call it.
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename DstBuffer>
void RunGeneric(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
ck::detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
// loop over space-filling curve
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
// std::cout<<"num_access:"<<num_access<<std::endl;
static_for<0, num_access, 1>{}([&](auto idx_1d) {
using src_vector_type = ck::cpu::vector_type_maker_t<SrcData, ScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
using dst_vector_type = ck::cpu::vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// printf("[%s] ", is_src_valid ? "y":"n");
// print_multi_index(src_coord_.GetIndex());
// printf("----");
// print_multi_index(src_coord_.GetHiddenIndex());
// printf(":%d", src_coord_.GetOffset());
// printf("\n");
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
// static_for<0, ScalarPerVector, 1>{}([&](auto i) {
// element_op_(dst_vector_container.template AsType<DstData>()(i),
// src_vector_container.template AsType<SrcData>()[i]);
// });
element_op_(dst_vector_container.template AsType<dst_vector_t>(),
src_vector_container.template AsType<src_vector_t>());
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// printf(" -> ");
// print_multi_index(dst_coord_.GetIndex());
// printf(":%d", dst_coord_.GetOffset());
// printf(", src:0x%x, dst:0x%x",
// *reinterpret_cast<uint32_t*>(&src_vector_container.template AsType<src_vector_t>()),
// *reinterpret_cast<uint32_t*>(&dst_vector_container.template
// AsType<dst_vector_t>()));
// printf("\n");
// copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>());
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
if constexpr(num_access == 0)
{
return typename SpaceFillingCurve::Index{};
}
else
{
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
return reset_step;
}
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
{
// if the src coord was not reset by RunGeneric(), the step needs to be adjusted here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateResetStep();
printf(" GetCoordinateResetStep:");
print_multi_index(GetCoordinateResetStep());
printf(" adjusted_step_idx:");
print_multi_index(adjusted_step_idx);
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
printf(" adjusted_step:");
print_multi_index(adjusted_step.GetIndexDiff());
printf("\n");
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& dst_slice_origin_step_idx)
{
// if the dst coord was not reset by RunGeneric(), the step needs to be adjusted here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace cpu
} // namespace ck
#endif
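Relating this to the commit message: the static_for over the space-filling curve in RunGeneric() is fully unrolled at compile time, so the number of instantiated loop bodies grows with the slice size. A rough illustration with hypothetical numbers:

// Illustration only: for a hypothetical MC x KC = 256 x 128 fp32 tile moved 8 floats
// per access, the compiler must instantiate 4096 unrolled iterations.
constexpr int MC              = 256;
constexpr int KC              = 128;
constexpr int ScalarPerVector = 8;
constexpr int num_access      = (MC * KC) / ScalarPerVector;
static_assert(num_access == 4096, "large tiles imply very long compile-time unrolls");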
...@@ -34,7 +34,7 @@
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "debug.hpp"
// TODO: remove this
...
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// scalar
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// AVX
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
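A minimal sketch (assuming this header is included) of the vector_type wrappers above: load 8 floats, view them as the raw __m256 via AsType, and store them back:

#include <immintrin.h>
inline void scale8(float* dst, const float* src, float s)
{
    ck::cpu::vector_type<float, 8> v(src);                 // _mm256_loadu_ps under the hood
    ck::cpu::float8_t raw = v.AsType<ck::cpu::float8_t>(); // raw __m256 view
    v.AsType<ck::cpu::float8_t>() = _mm256_mul_ps(raw, _mm256_set1_ps(s));
    v.Store(dst);                                          // _mm256_storeu_ps under the hood
}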
#ifndef CK_BUFFER_CPU_HPP
#define CK_BUFFER_CPU_HPP
#include "config.hpp"
#include "enable_if.hpp"
#include "data_type_cpu.hpp"
namespace ck {
namespace cpu {
template <AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
struct DynamicBuffer
{
using type = T;
static_assert(BufferAddressSpace ==
AddressSpaceEnum_t::Global); // only valid for global address space on cpu
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size, T invalid_element_value)
: p_data_{p_data},
element_space_size_{element_space_size},
invalid_element_value_{invalid_element_value}
{
}
static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; }
constexpr const T& operator[](index_t i) const { return p_data_[i]; }
constexpr T& operator()(index_t i) { return p_data_[i]; }
// X should be data_type::type, not directly data_type
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
constexpr auto Get(index_t i, bool is_valid_element) const
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
clear_vector(v);
return v;
}
else
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
set_vector(v, invalid_element_value_);
return v;
}
}
template <InMemoryDataOperationEnum_t Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
if(is_valid_element)
{
store_vector(x, &p_data_[i]);
}
}
static constexpr bool IsStaticBuffer() { return false; }
static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
p, element_space_size, invalid_element_value};
}
} // namespace cpu
} // namespace ck
#endif
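A minimal sketch (assuming this header is included) that wraps raw float arrays in DynamicBuffer and moves 8 floats per step through Get()/Update(); n is assumed to be a multiple of 8:

inline void copy_through_buffers(float* p_dst, const float* p_src, ck::index_t n)
{
    using namespace ck;
    using namespace ck::cpu;
    auto src_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(const_cast<float*>(p_src), n);
    auto dst_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_dst, n);
    for(index_t i = 0; i + 8 <= n; i += 8)
    {
        // invalid elements would read back as numerical zero for this buffer flavor
        auto v = src_buf.Get<float8_t>(i, /*is_valid_element=*/true);
        dst_buf.Update<InMemoryDataOperationEnum_t::Set, float8_t>(i, true, v);
    }
}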
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/tensor_operation_instance/cpu)
...@@ -29,6 +29,8 @@ struct DeviceAlignedMemCPU
DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
void* GetDeviceBuffer();
std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
void SetZero();
~DeviceAlignedMemCPU();
...@@ -108,4 +110,27 @@ float launch_and_time_kernel(
return timer.GetElapsedTime() / nrepeat;
}
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
{
kernel(args...);
}
template <typename... Args, typename F>
float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
{
WallTimer timer;
kernel(args...);
timer.Start();
for(int i = 0; i < nrepeat; i++)
{
kernel(args...);
}
timer.End();
return timer.GetElapsedTime() / nrepeat;
}
#endif
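A usage sketch for the CPU launch helpers above: the kernel is any callable, a warm-up call happens before timing, and the average time over nrepeat runs is returned.

// Hypothetical example: time a trivial scaling kernel with launch_and_time_cpu_kernel.
inline float time_scale_kernel(float* p, int n, int nrepeat)
{
    auto scale_kernel = [](float* data, int len) {
        for(int i = 0; i < len; ++i)
            data[i] *= 2.0f;
    };
    return launch_and_time_cpu_kernel(scale_kernel, nrepeat, p, n);
}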
...@@ -45,6 +45,10 @@ void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
...
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/external/include/half
)
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} SHARED ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
add_subdirectory(conv2d_fwd)
# device_conv2d_fwd_cpu_instance
set(DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE
device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
)
add_library(device_conv2d_fwd_cpu_instance SHARED ${DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE})
target_compile_features(device_conv2d_fwd_cpu_instance PUBLIC)
set_target_properties(device_conv2d_fwd_cpu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_fwd_cpu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_cpu_instance)
#include <stdlib.h>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using InType = float;
using WeiType = float;
using OutType = float;
using AccType = float;
using InLayout = ck::tensor_layout::gemm::RowMajor; // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXC
static constexpr bool NonTemporalStore = false;
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using ThreadwiseGemmAvx2_MxN_4x24_Dispatch =
ck::cpu::ThreadwiseGemmAvx2_MxN_4x24_Dispatch<InType,
WeiType,
OutType,
InLayout,
WeiLayout,
NonTemporalStore>;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;
static constexpr auto ConvFwd1x1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
static constexpr auto ConvFwd1x1S1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
//#################################################################|InDataType|WeiDataType|OutDataType|AccDataType|InElementwiseOp|WeiElementwiseOp|OutElementwiseOp|ConvForwardSp|NumDimSpatial|MPerBlock|NPerBlock|KPerBlock|ThreadwiseGemm_Dispatch
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
256,
128,
64,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
512,
256,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
1024,
144,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>>;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
...@@ -32,6 +32,7 @@ set(PROFILER_SOURCE
src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_fwd_cpu.cpp
src/profile_convnd_bwd_data.cpp
src/profile_reduce.cpp
src/profile_grouped_gemm.cpp
...@@ -51,6 +52,7 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_cpu_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
#define AVX2_DATA_ALIGNMENT 32 // 32-byte alignment for 256-bit AVX2 registers
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_cpu_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::cpu::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_gflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float gflops = static_cast<float>(flop) / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(gflops > best_gflops)
{
best_conv_name = conv_name;
best_gflops = gflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
memcpy(out_n_k_ho_wo_device_result.mData.data(),
out_device_buf.mpDeviceBuf,
out_device_buf.mMemSize);
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck
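A hypothetical driver (not part of this commit) showing how profile_conv_cpu_fwd_impl above could be invoked for a small fp32 NHWC/KYXC/NHWK case; all sizes are made up for illustration:

inline void run_example_profile()
{
    ck::profiler::profile_conv_cpu_fwd_impl<2,
                                            float,
                                            float,
                                            float,
                                            ck::tensor_layout::convolution::NHWC,
                                            ck::tensor_layout::convolution::KYXC,
                                            ck::tensor_layout::convolution::NHWK>(
        /*do_verification=*/1,
        /*init_method=*/1,
        /*do_log=*/false,
        /*nrepeat=*/10,
        /*N=*/1,
        /*K=*/64,
        /*C=*/32,
        /*input_spatial_lengths=*/{28, 28},
        /*filter_spatial_lengths=*/{3, 3},
        /*output_spatial_lengths=*/{28, 28},
        /*conv_filter_strides=*/{1, 1},
        /*conv_filter_dilations=*/{1, 1},
        /*input_left_pads=*/{1, 1},
        /*input_right_pads=*/{1, 1});
}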