Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

2c1ed8b2 · Anthony Chang · b86b318b · 56adf7e9 · 2c1ed8b2 · 2c1ed8b2
Commit 2c1ed8b2 authored Jun 20, 2022 by Anthony Chang
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -35,14 +35,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
    using IndexDataType = int32_t;
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-            AccElementwiseOperation;
    static constexpr index_t InSrcOutDstVectorDim =
        0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
@@ -178,13 +177,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
            invariant_lowest_length_ = C;
            reduce_lowest_length_    = window_spatial_lengths[1];
-            // TODO: is this correct?
+            int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
-            if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
-            {
+            std::tie(in_element_op_, acc_element_op_) =
-                ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
+                reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
-                in_element_op_      = InElementwiseOperation{divider};
-                acc_element_op_     = AccElementwiseOperation{divider};
-            }
        }
        const InDataType* p_in_dev_;

--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
@@ -61,12 +61,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
    static constexpr bool use_multiblock =
        (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
-    static constexpr bool out_type_compatible_with_atomic_op =
+    static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
-        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
+                                                                      OutDataType>::value,
+                  "The OutDataType must support the specified OutMemoryDataOperation!");
-    static_assert(
-        !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
-        "The OutDataType must support the atomic operation for using MultiBlock reduction");
    static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                  "MultiBlock reduction can only be used when outputing index is not required");
@@ -349,7 +346,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
            if constexpr(use_multiblock)
            {
                const auto identityVal =
-                    ck::reduce::GetIdentityValueueForInMemoryDataOperation<OutDataType>(
+                    ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
                        OutMemoryDataOperation);
                const auto kernel_pre =
@@ -492,7 +489,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
+        str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";

--- a/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+#pragma once
+#include <iostream>
+#include <vector>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "gridwise_unary_elementwise_1d.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename ADataType,
+          typename BDataType,
+          typename ElementwiseFunctor,
+          index_t Dim,
+          index_t ScalarPerVector>
+struct DeviceUnaryElementwise : public BaseOperator
+{
+    static constexpr auto I0 = Number<0>{};
+    template <typename Desc_M0>
+    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+    {
+        const auto m0           = desc_m0.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * ScalarPerVector;
+        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
+        const auto desc_m0_pad =
+            transform_tensor_descriptor(desc_m0,
+                                        make_tuple(make_right_pad_transform(m0, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m0_pad;
+    }
+    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
+                                  const std::vector<index_t>& stride,
+                                  index_t gridSize,
+                                  index_t blockSize)
+    {
+        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+        // nd desc - [s0, s1, s2, ...]
+        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
+        // merge nd to 1d desc - [s0 * s1 * ...]
+        if constexpr(Dim > 1)
+        {
+            const auto desc_m0 = transform_tensor_descriptor(
+                desc,
+                make_tuple(make_merge_transform(tupleOfShape)),
+                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+                make_tuple(Sequence<0>{}));
+            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+        }
+        else
+            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
+    }
+    using GridDesc_M0      = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
+    using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
+                                                         BDataType,
+                                                         GridDesc_M0,
+                                                         ElementwiseFunctor,
+                                                         ScalarPerVector>;
+    struct Argument : public BaseArgument
+    {
+        Argument(const ADataType* p_a,
+                 BDataType* p_b,
+                 const std::vector<index_t>& shape,
+                 const std::vector<index_t>& stride_a,
+                 const std::vector<index_t>& stride_b,
+                 ElementwiseFunctor functor)
+            : p_a_(p_a),
+              p_b_(p_b),
+              shape_(shape),
+              functor_(functor),
+              blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
+        {
+            index_t tensor_size =
+                std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
+            gridSize_       = GridwiseUEltwise::CalculateGridSize(tensor_size);
+            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
+            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
+        }
+        const ADataType* p_a_;
+        BDataType* p_b_;
+        std::vector<int> shape_;
+        GridDesc_M0 a_grid_desc_m0_;
+        GridDesc_M0 b_grid_desc_m0_;
+        ElementwiseFunctor functor_;
+        index_t blockSize_;
+        index_t gridSize_;
+    };
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
+                                                            ADataType,
+                                                            BDataType,
+                                                            GridDesc_M0,
+                                                            ElementwiseFunctor>;
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(arg.gridSize_),
+                                                        dim3(arg.blockSize_),
+                                                        0,
+                                                        arg.p_a_,
+                                                        arg.p_b_,
+                                                        arg.a_grid_desc_m0_,
+                                                        arg.b_grid_desc_m0_,
+                                                        arg.functor_);
+            return elapsed_time;
+        }
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+        if(pArg == nullptr)
+            return false;
+        if(pArg->shape_.back() % ScalarPerVector != 0)
+            return false;
+        return true;
+    };
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      void* p_b,
+                                                      std::vector<index_t> shape,
+                                                      std::vector<index_t> stride_a,
+                                                      std::vector<index_t> stride_b,
+                                                      ElementwiseFunctor functor)
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<BDataType*>(p_b),
+                                          shape,
+                                          stride_a,
+                                          stride_b,
+                                          functor);
+    }
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceBinaryElementwise"
+            << "<"
+            << "ScalarPerVector = " << ScalarPerVector
+            << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
--- a/include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
-#pragma once
-#include "data_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace element_wise {
-} // namespace element_wise
-} // namespace tensor_operation
-} // namespace ck
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+#pragma once
+#include "data_type.hpp"
+#include "math_v2.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+struct PassThrough
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
+                          is_same<T, int32_t>::value || is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x;
+    };
+};
+struct UnaryDivide
+{
+    __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider){};
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, int32_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x / type_convert<T>(divider_);
+    };
+    int32_t divider_ = 1;
+};
+struct UnarySquare
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value,
+                      "Data type is not supported by this operation!");
+        y = x * x;
+    };
+};
+struct UnaryAbs
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = ck::math::abs(x);
+    };
+};
+struct UnarySqrt
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value,
+                      "Data type is not supported by this operation!");
+        y = ck::math::sqrt(x);
+    };
+};
+struct Relu
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x > 0 ? x : 0;
+    }
+    template <>
+    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
+    {
+        float x_f32 = ck::type_convert<float>(x);
+        float y_f32 = x_f32 > 0 ? x_f32 : 0;
+        y           = ck::type_convert<bhalf_t>(y_f32);
+    }
+};
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+struct FastGelu
+{
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const;
+    template <>
+    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
+    {
+        const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
+        const float emu = exp(-u);
+        const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
+        y = x * cdf;
+    }
+};
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -791,8 +791,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
            constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock =
                GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+            void* p_shared = static_cast<void*>(p_shared_block);
            auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-                static_cast<FloatC*>(p_shared_block),
+                static_cast<FloatC*>(p_shared),
                c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
            static_assert(M1 == MWave, "");

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp