Commit e72c0c43 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents d714fa15 313bbea5
@@ -5,10 +5,16 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-enum GemmSpecialization_t
+enum struct GemmSpecialization_t
 {
     Default,
+    MPadding,
+    NPadding,
+    KPadding,
     MNPadding,
+    MKPadding,
+    NKPadding,
+    MNKPadding,
 };
 } // namespace device
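Since GemmSpecialization_t is now a scoped enum, its enumerators no longer leak into the enclosing namespace and call sites must qualify them. The same enum-to-enum-struct change is applied to AddressSpaceEnum_t further down. A minimal sketch of the resulting usage; the predicate is hypothetical, only the enum mirrors this commit:

// Mirror of the enum after this change.
enum struct GemmSpecialization_t
{
    Default,
    MPadding,
    NPadding,
    KPadding,
    MNPadding,
    MKPadding,
    NKPadding,
    MNKPadding,
};

// Hypothetical helper: scoped enumerators must be written qualified.
constexpr bool NeedsKPadding(GemmSpecialization_t spec)
{
    return spec == GemmSpecialization_t::KPadding ||
           spec == GemmSpecialization_t::MKPadding ||
           spec == GemmSpecialization_t::NKPadding ||
           spec == GemmSpecialization_t::MNKPadding;
}

static_assert(NeedsKPadding(GemmSpecialization_t::MNKPadding), "");
static_assert(!NeedsKPadding(GemmSpecialization_t::MNPadding), "");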
......
@@ -85,6 +85,7 @@ struct NKHW : public BaseTensorLayout
     static constexpr const char* name = "NKHW";
 };
+// 3D Conv
 struct NDHWC : public BaseTensorLayout
 {
     static constexpr const char* name = "NDHWC";
@@ -100,6 +101,21 @@ struct NDHWK : public BaseTensorLayout
     static constexpr const char* name = "NDHWK";
 };
+struct NCDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NCDHW";
+};
+struct KCZYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "KCZYX";
+};
+struct NKDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NKDHW";
+};
 } // namespace convolution
 template <
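The new NCDHW/KCZYX/NKDHW tags complete the channels-first layout set for 3D convolution (input, weight, output). These are empty tag types; device ops select code paths by comparing them at compile time. A self-contained sketch of that pattern, with local stand-ins for the real tags (which derive from BaseTensorLayout inside namespace convolution):

#include <type_traits>

// Local stand-ins for the layout tags above.
struct NDHWC { static constexpr const char* name = "NDHWC"; };
struct NCDHW { static constexpr const char* name = "NCDHW"; };

// Hypothetical compile-time dispatch on a 3D-conv input layout.
template <typename InLayout>
constexpr bool IsChannelsLast()
{
    return std::is_same<InLayout, NDHWC>::value;
}

static_assert(IsChannelsLast<NDHWC>(), "NDHWC keeps C innermost");
static_assert(!IsChannelsLast<NCDHW>(), "NCDHW keeps C outermost (after N)");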
......
-#ifndef CK_ELEMENT_WISE_OPERATION_HPP
-#define CK_ELEMENT_WISE_OPERATION_HPP
+#pragma once
 #include "data_type.hpp"
 namespace ck {
@@ -19,6 +16,8 @@ struct PassThrough
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
     __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
+    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
 };
 struct Add
@@ -239,6 +238,24 @@ struct UnaryIdentic<int32_t, int32_t, false>
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
 };
+template <>
+struct UnaryIdentic<int32_t, int32_t, true>
+{
+    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
+    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
+    int32_t divider_ = 1;
+};
+template <>
+struct UnaryIdentic<int8_t, int8_t, false>
+{
+    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
+};
 template <typename Y, typename X, bool HasDividing = false>
 struct UnarySquare;
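The new UnaryIdentic<int32_t, int32_t, true> specialization folds an integer division into the identity, which is presumably how an averaged result is produced from a summed accumulator (divider = reduce length). A host-side sketch of the behavior; names are local to the example:

#include <cassert>
#include <cstdint>

// Behavioral copy of the HasDividing=true specialization, host-only.
struct UnaryIdenticDiv
{
    explicit UnaryIdenticDiv(int32_t divider = 1) : divider_(divider) {}
    void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; }
    int32_t divider_ = 1;
};

int main()
{
    int32_t sum  = 12; // accumulated over 4 elements
    int32_t mean = 0;
    UnaryIdenticDiv op{4}; // divider = number of reduced elements
    op(mean, sum);
    assert(mean == 3);
    return 0;
}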
@@ -311,6 +328,19 @@ struct UnaryAbs<double, double>
     __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
 };
+template <>
+struct UnaryAbs<int8_t, int8_t>
+{
+    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
+    {
+        int8_t sgn = x >> (8 - 1);
+        y = (x ^ sgn) - sgn;
+    };
+};
 template <typename Y, typename X>
 struct UnarySqrt;
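The int8 UnaryAbs uses the standard branchless two's-complement trick: the arithmetic right shift makes sgn 0 for non-negative x and -1 (all ones) for negative x, so (x ^ sgn) - sgn is x unchanged or ~x + 1 == -x. A quick host-side check (int8_t(-128) has no representable absolute value in int8_t, so the loop stops at 127):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Same trick as UnaryAbs<int8_t, int8_t> above, host-only.
int8_t abs_branchless(int8_t x)
{
    int8_t sgn = x >> (8 - 1); // 0 if x >= 0, -1 (0xFF) if x < 0
    return static_cast<int8_t>((x ^ sgn) - sgn);
}

int main()
{
    for (int v = -127; v <= 127; ++v)
        assert(abs_branchless(static_cast<int8_t>(v)) == std::abs(v));
    return 0;
}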
@@ -333,4 +363,3 @@
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
-#endif
+#pragma once
+#include "data_type.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+struct ReduceSum
+{
+    __host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
+    __host__ __device__ void Reduce(float& acc, float v) const { acc += v; }
+};
+struct ReduceSquareSum
+{
+    __host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
+    __host__ __device__ void Reduce(float& acc, float v) const { acc += v * v; }
+};
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
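These two wrappers expose only an initial value and an in-place accumulate, so any driver honoring that contract can use them. A serial host sketch of the pattern (the real gridwise kernels partition this across threads; the __host__ __device__ qualifiers are dropped here):

#include <cassert>

// Host-only copies of the two operations above.
struct ReduceSum
{
    static constexpr float GetReduceZeroValue() { return 0.0f; }
    void Reduce(float& acc, float v) const { acc += v; }
};

struct ReduceSquareSum
{
    static constexpr float GetReduceZeroValue() { return 0.0f; }
    void Reduce(float& acc, float v) const { acc += v * v; }
};

// Hypothetical serial driver honoring the same contract.
template <typename ReduceOp>
float ReduceArray(const float* p, int n, ReduceOp op)
{
    float acc = ReduceOp::GetReduceZeroValue();
    for(int i = 0; i < n; ++i)
        op.Reduce(acc, p[i]);
    return acc;
}

int main()
{
    const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    assert(ReduceArray(data, 4, ReduceSum{}) == 10.0f);
    assert(ReduceArray(data, 4, ReduceSquareSum{}) == 30.0f);
    return 0;
}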
@@ -32,6 +32,7 @@
 #include "reduction_functions_blockwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
+#include "element_wise_operation.hpp"
 namespace ck {
@@ -84,6 +85,11 @@ template <typename InDataType,
           index_t OutDstVectorSize>
 struct GridwiseReduction_mk_to_m_multiblock_atomic_add
 {
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
     using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
@@ -109,8 +115,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
                                   ReduceOperation,
                                   PropagateNan>;
-    template <typename T>
-    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -249,7 +254,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
             OutDataType,
             decltype(reduced_data_desc),
             OutGridDesc_M,
-            PassThroughOp<AccDataType>,
+            PassThroughOp,
             Sequence<MThreadSliceSize>,
             Sequence<0>,
             0,
@@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
             out_grid_desc_m,
             make_multi_index(blkgroup_id * M_BlockTileSize +
                              thread_m_cluster_id * MThreadSliceSize),
-            PassThroughOp<AccDataType>{});
+            PassThroughOp{});
         threadwise_dst_store.Run(
             reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
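The added static_assert only encodes divisibility: whichever dimension is vectorized for reads must have its per-thread slice divisible by the read vector width, and the M slice must be divisible by the write vector width. Separately, PassThroughOp drops its type parameter because element_wise::PassThrough provides per-type operator() overloads. For illustration, a configuration like the following (values hypothetical) passes the check:

// Hypothetical tile parameters, checked exactly like the new static_assert.
constexpr int InSrcVectorDim   = 1; // vectorized loads along K
constexpr int MThreadSliceSize = 4;
constexpr int KThreadSliceSize = 8;
constexpr int InSrcVectorSize  = 4; // 8 % 4 == 0 -> OK
constexpr int OutDstVectorSize = 2; // 4 % 2 == 0 -> OK

static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
               (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                  (MThreadSliceSize % OutDstVectorSize == 0),
              "Invalid thread slice sizes and/or vector sizes configuration, please check!");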
......
@@ -10,6 +10,7 @@
 #include "blockwise_tensor_slice_transfer_v6r1.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "gridwise_gemm_pipeline_v1.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -657,6 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                 n_thread_data_on_block_idx[I2]),
             ck::tensor_operation::element_wise::PassThrough{}};
+    // LDS to global
     auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
         BlockSize,             // index_t BlockSize,
         CElementwiseOperation, // ElementwiseOperation,
......
@@ -9,7 +9,7 @@
 namespace ck {
-enum AddressSpaceEnum_t
+enum struct AddressSpaceEnum_t
 {
     Generic,
     Global,
......