Commit e72c0c43 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents d714fa15 313bbea5
@@ -5,10 +5,16 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-enum GemmSpecialization_t
+enum struct GemmSpecialization_t
 {
     Default,
+    MPadding,
+    NPadding,
+    KPadding,
     MNPadding,
+    MKPadding,
+    NKPadding,
+    MNKPadding,
 };
 } // namespace device
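Since GemmSpecialization_t is now a scoped enum, its enumerators no longer leak into the enclosing namespace and call sites must qualify them. The same enum-to-enum-struct change is applied to AddressSpaceEnum_t further down. A minimal sketch of the resulting usage; the predicate is hypothetical, only the enum mirrors this commit:

// Mirror of the enum after this change.
enum struct GemmSpecialization_t
{
    Default,
    MPadding,
    NPadding,
    KPadding,
    MNPadding,
    MKPadding,
    NKPadding,
    MNKPadding,
};

// Hypothetical helper: scoped enumerators must be written qualified.
constexpr bool NeedsKPadding(GemmSpecialization_t spec)
{
    return spec == GemmSpecialization_t::KPadding ||
           spec == GemmSpecialization_t::MKPadding ||
           spec == GemmSpecialization_t::NKPadding ||
           spec == GemmSpecialization_t::MNKPadding;
}

static_assert(NeedsKPadding(GemmSpecialization_t::MNKPadding), "");
static_assert(!NeedsKPadding(GemmSpecialization_t::MNPadding), "");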
......
@@ -85,6 +85,7 @@ struct NKHW : public BaseTensorLayout
     static constexpr const char* name = "NKHW";
 };
+// 3D Conv
 struct NDHWC : public BaseTensorLayout
 {
     static constexpr const char* name = "NDHWC";
@@ -100,6 +101,21 @@ struct NDHWK : public BaseTensorLayout
     static constexpr const char* name = "NDHWK";
 };
+struct NCDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NCDHW";
+};
+struct KCZYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "KCZYX";
+};
+struct NKDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NKDHW";
+};
 } // namespace convolution
 template <
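The new NCDHW/KCZYX/NKDHW tags complete the channels-first layout set for 3D convolution (input, weight, output). These are empty tag types; device ops select code paths by comparing them at compile time. A self-contained sketch of that pattern, with local stand-ins for the real tags (which derive from BaseTensorLayout inside namespace convolution):

#include <type_traits>

// Local stand-ins for the layout tags above.
struct NDHWC { static constexpr const char* name = "NDHWC"; };
struct NCDHW { static constexpr const char* name = "NCDHW"; };

// Hypothetical compile-time dispatch on a 3D-conv input layout.
template <typename InLayout>
constexpr bool IsChannelsLast()
{
    return std::is_same<InLayout, NDHWC>::value;
}

static_assert(IsChannelsLast<NDHWC>(), "NDHWC keeps C innermost");
static_assert(!IsChannelsLast<NCDHW>(), "NCDHW keeps C outermost (after N)");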
......
-#ifndef CK_ELEMENT_WISE_OPERATION_HPP
-#define CK_ELEMENT_WISE_OPERATION_HPP
+#pragma once
 #include "data_type.hpp"
 namespace ck {
@@ -19,6 +16,8 @@ struct PassThrough
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
     __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
+    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
 };
 struct Add
@@ -239,6 +238,24 @@ struct UnaryIdentic<int32_t, int32_t, false>
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
 };
+template <>
+struct UnaryIdentic<int32_t, int32_t, true>
+{
+    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
+    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
+    int32_t divider_ = 1;
+};
+template <>
+struct UnaryIdentic<int8_t, int8_t, false>
+{
+    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
+};
 template <typename Y, typename X, bool HasDividing = false>
 struct UnarySquare;
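The new UnaryIdentic<int32_t, int32_t, true> specialization folds an integer division into the identity, which is presumably how an averaged result is produced from a summed accumulator (divider = reduce length). A host-side sketch of the behavior; names are local to the example:

#include <cassert>
#include <cstdint>

// Behavioral copy of the HasDividing=true specialization, host-only.
struct UnaryIdenticDiv
{
    explicit UnaryIdenticDiv(int32_t divider = 1) : divider_(divider) {}
    void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; }
    int32_t divider_ = 1;
};

int main()
{
    int32_t sum  = 12; // accumulated over 4 elements
    int32_t mean = 0;
    UnaryIdenticDiv op{4}; // divider = number of reduced elements
    op(mean, sum);
    assert(mean == 3);
    return 0;
}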
@@ -311,6 +328,19 @@ struct UnaryAbs<double, double>
     __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
 };
+template <>
+struct UnaryAbs<int8_t, int8_t>
+{
+    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
+    {
+        int8_t sgn = x >> (8 - 1);
+        y = (x ^ sgn) - sgn;
+    };
+};
 template <typename Y, typename X>
 struct UnarySqrt;
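The int8 UnaryAbs uses the standard branchless two's-complement trick: the arithmetic right shift makes sgn 0 for non-negative x and -1 (all ones) for negative x, so (x ^ sgn) - sgn is x unchanged or ~x + 1 == -x. A quick host-side check (int8_t(-128) has no representable absolute value in int8_t, so the loop stops at 127):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Same trick as UnaryAbs<int8_t, int8_t> above, host-only.
int8_t abs_branchless(int8_t x)
{
    int8_t sgn = x >> (8 - 1); // 0 if x >= 0, -1 (0xFF) if x < 0
    return static_cast<int8_t>((x ^ sgn) - sgn);
}

int main()
{
    for (int v = -127; v <= 127; ++v)
        assert(abs_branchless(static_cast<int8_t>(v)) == std::abs(v));
    return 0;
}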
@@ -333,4 +363,3 @@
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
-#endif
+#pragma once
+#include "data_type.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+struct ReduceSum
+{
+    __host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
+    __host__ __device__ void Reduce(float& acc, float v) const { acc += v; }
+};
+struct ReduceSquareSum
+{
+    __host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
+    __host__ __device__ void Reduce(float& acc, float v) const { acc += v * v; }
+};
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
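These two wrappers expose only an initial value and an in-place accumulate, so any driver honoring that contract can use them. A serial host sketch of the pattern (the real gridwise kernels partition this across threads; the __host__ __device__ qualifiers are dropped here):

#include <cassert>

// Host-only copies of the two operations above.
struct ReduceSum
{
    static constexpr float GetReduceZeroValue() { return 0.0f; }
    void Reduce(float& acc, float v) const { acc += v; }
};

struct ReduceSquareSum
{
    static constexpr float GetReduceZeroValue() { return 0.0f; }
    void Reduce(float& acc, float v) const { acc += v * v; }
};

// Hypothetical serial driver honoring the same contract.
template <typename ReduceOp>
float ReduceArray(const float* p, int n, ReduceOp op)
{
    float acc = ReduceOp::GetReduceZeroValue();
    for(int i = 0; i < n; ++i)
        op.Reduce(acc, p[i]);
    return acc;
}

int main()
{
    const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    assert(ReduceArray(data, 4, ReduceSum{}) == 10.0f);
    assert(ReduceArray(data, 4, ReduceSquareSum{}) == 30.0f);
    return 0;
}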
@@ -32,6 +32,7 @@
 #include "reduction_functions_blockwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
+#include "element_wise_operation.hpp"
 namespace ck {
@@ -84,6 +85,11 @@ template <typename InDataType,
           index_t OutDstVectorSize>
 struct GridwiseReduction_mk_to_m_multiblock_atomic_add
 {
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
     using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
@@ -109,8 +115,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
                                   ReduceOperation,
                                   PropagateNan>;
-    template <typename T>
-    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -249,7 +254,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
             OutDataType,
             decltype(reduced_data_desc),
             OutGridDesc_M,
-            PassThroughOp<AccDataType>,
+            PassThroughOp,
             Sequence<MThreadSliceSize>,
             Sequence<0>,
             0,
@@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
             out_grid_desc_m,
             make_multi_index(blkgroup_id * M_BlockTileSize +
                              thread_m_cluster_id * MThreadSliceSize),
-            PassThroughOp<AccDataType>{});
+            PassThroughOp{});
         threadwise_dst_store.Run(
             reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
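The added static_assert only encodes divisibility: whichever dimension is vectorized for reads must have its per-thread slice divisible by the read vector width, and the M slice must be divisible by the write vector width. Separately, PassThroughOp drops its type parameter because element_wise::PassThrough provides per-type operator() overloads. For illustration, a configuration like the following (values hypothetical) passes the check:

// Hypothetical tile parameters, checked exactly like the new static_assert.
constexpr int InSrcVectorDim   = 1; // vectorized loads along K
constexpr int MThreadSliceSize = 4;
constexpr int KThreadSliceSize = 8;
constexpr int InSrcVectorSize  = 4; // 8 % 4 == 0 -> OK
constexpr int OutDstVectorSize = 2; // 4 % 2 == 0 -> OK

static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
               (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                  (MThreadSliceSize % OutDstVectorSize == 0),
              "Invalid thread slice sizes and/or vector sizes configuration, please check!");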
......
@@ -10,6 +10,7 @@
 #include "blockwise_tensor_slice_transfer_v6r1.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "gridwise_gemm_pipeline_v1.hpp"
+#include "tensor_space_filling_curve.hpp"
 namespace ck {
@@ -657,6 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                 n_thread_data_on_block_idx[I2]),
             ck::tensor_operation::element_wise::PassThrough{}};
+    // LDS to global
     auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
         BlockSize,             // index_t BlockSize,
         CElementwiseOperation, // ElementwiseOperation,
......
@@ -9,7 +9,7 @@
 namespace ck {
-enum AddressSpaceEnum_t
+enum struct AddressSpaceEnum_t
 {
     Generic,
     Global,
......