Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into wmma_op

4fec5ad3 · aska-0096 · 24faa1fc · 87fd1152 · 4fec5ad3 · 4fec5ad3
Commit 4fec5ad3 authored Oct 28, 2022 by aska-0096
20 changed files
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// Identifies which masking specialization is in effect.
+enum class MaskingSpecialization
+{
+    // presumably: no element is masked out -- confirm against the users of this enum
+    MaskDisabled,
+    // presumably: elements above the diagonal are masked out -- confirm against users
+    MaskOutUpperTriangle
+};
+// Returns a printable name for the masking specialization `s`.
+// Values that match no known enumerator yield "Unrecognized specialization!".
+inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
+{
+    if(s == MaskingSpecialization::MaskDisabled)
+    {
+        return "MaskDisabled";
+    }
+    if(s == MaskingSpecialization::MaskOutUpperTriangle)
+    {
+        return "MaskOutUpperTriangle";
+    }
+    // Fallback for any value outside the enumerators handled above.
+    return "Unrecognized specialization!";
+}
+// Predicate that never masks anything: both queries ignore their arguments and
+// answer false.
+struct MaskDisabledPredicate
+{
+    // Is element (m, n) masked out? Never.
+    __host__ __device__ constexpr bool operator()(index_t /*m*/, index_t /*n*/) const
+    {
+        return false;
+    }
+    // Can the tile described by (m, n, m_tile, n_tile) be skipped? Never.
+    __host__ __device__ constexpr bool IsTileSkippable(index_t /*m*/,
+                                                       index_t /*n*/,
+                                                       index_t /*m_tile*/,
+                                                       index_t /*n_tile*/) const
+    {
+        return false;
+    }
+};
+// Predicate that masks every element whose n index is strictly greater than its
+// m index.
+struct MaskOutUpperTrianglePredicate
+{
+    // True when element (m, n) is masked out, i.e. when n > m.
+    __host__ __device__ constexpr bool operator()(index_t m, index_t n) const
+    {
+        return m < n;
+    }
+    // Evaluates the element predicate at (m + m_tile - 1, n); n_tile is unused.
+    // Presumably (m, n) is the tile origin and m_tile its extent along m, so the
+    // probed element is the one with the largest m and smallest n in the tile:
+    // if even that one is masked, the whole tile is -- TODO confirm with callers.
+    __host__ __device__ constexpr bool
+    IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t /*n_tile*/) const
+    {
+        const index_t last_m = m + m_tile - 1;
+        return (*this)(last_m, n);
+    }
+};
+// Tracks the elements of C0 that need to be set to -inf: those selected by
+// MaskOutPredicate, plus those whose n index is at or beyond NRaw.
+// Note: values in the M padding need no reset because they are never stored out.
+template <typename MaskOutPredicate>
+struct C0MatrixMask_impl
+{
+    // NRaw: exclusive upper bound on valid n indices (presumably the unpadded N
+    // length -- confirm with callers).
+    // Qualified __host__ __device__ constexpr like every other member, so the mask
+    // can also be constructed in device code and in constant expressions; existing
+    // host-side construction is unaffected.
+    __host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
+        : NRaw_(NRaw), predicate_(MaskOutPredicate{})
+    {
+    }
+    // True when n lies at or beyond the NRaw bound given at construction.
+    __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const
+    {
+        return n >= NRaw_;
+    }
+    // True when (m, n) is selected by the predicate or n is out of bound.
+    __host__ __device__ constexpr bool IsMaskedElement(index_t m, index_t n) const
+    {
+        return predicate_(m, n) || IsNOutOfBound(n);
+    }
+    // Forwards to the predicate's tile test; the NRaw bound is not consulted here.
+    __host__ __device__ constexpr bool
+    IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t n_tile) const
+    {
+        return predicate_.IsTileSkippable(m, n, m_tile, n_tile);
+    }
+    private:
+    // index_t MRaw_;
+    index_t NRaw_;
+    MaskOutPredicate predicate_;
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/welford_helper.hpp
+++ b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -3,6 +3,7 @@
 #pragma once
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
@@ -75,4 +75,63 @@ struct ThreadwiseWelford
    int max_count_;
 };
+// Merges partial (mean, variance-term, count) results held in M x K source
+// buffers into M running results held in the destination buffers.
+// The destination entries are read and updated in place, so they must hold valid
+// values before Run() is called.
+// When GetActualVariance is true, each merged variance term is divided by its
+// merged count once all K source entries of that row have been folded in.
+template <typename T,
+          typename SrcMeanVarCountThreadDesc_M_K,
+          typename DstMeanVarThreadDesc_M,
+          bool GetActualVariance = false>
+struct ThreadwiseWelfordMerge
+{
+    static constexpr auto src_thread_desc_m_k = SrcMeanVarCountThreadDesc_M_K{};
+    static constexpr auto dst_thread_desc_m   = DstMeanVarThreadDesc_M{};
+    // Compile-time extents of the source (M, K) and destination (M) descriptors.
+    static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{});
+    static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{});
+    static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{});
+    static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!");
+    // Folds partial result b into partial result a (a is updated in place).
+    // A combined count of zero uses a weight of zero for b, which avoids a
+    // division by zero.
+    __device__ static void
+    Merge(T& mean_a, T& var_a, int32_t& count_a, T mean_b, T var_b, int32_t count_b)
+    {
+        const int merged_count = count_a + count_b;
+        // Fraction of the merged samples contributed by b.
+        T weight_b = type_convert<T>(0);
+        if(merged_count != 0)
+        {
+            weight_b = type_convert<T>(count_b) / merged_count;
+        }
+        // Must be taken before mean_a is overwritten below.
+        const T mean_diff = mean_b - mean_a;
+        mean_a += mean_diff * weight_b;
+        var_a += var_b + mean_diff * mean_diff * count_a * weight_b;
+        count_a = merged_count;
+    }
+    // For every m, merges the K source entries of row m into destination entry m.
+    template <typename SrcMeanBufferType,
+              typename SrcVarBufferType,
+              typename SrcCountBufferType,
+              typename DstMeanBufferType,
+              typename DstVarBufferType,
+              typename DstCountBufferType>
+    __device__ static void Run(const SrcMeanBufferType& src_mean_buf,
+                               const SrcVarBufferType& src_var_buf,
+                               const SrcCountBufferType& src_count_buf,
+                               DstMeanBufferType& dst_mean_buf,
+                               DstVarBufferType& dst_var_buf,
+                               DstCountBufferType& dst_count_buf)
+    {
+        static_for<0, src_length_m, 1>{}([&](auto m_idx) {
+            static_for<0, src_length_k, 1>{}([&](auto k_idx) {
+                // Offset of element (m, k) in the source buffers, per the source descriptor.
+                constexpr auto offset =
+                    src_thread_desc_m_k.CalculateOffset(make_tuple(m_idx, k_idx));
+                constexpr auto src_idx = Number<offset>{};
+                Merge(dst_mean_buf(m_idx),
+                      dst_var_buf(m_idx),
+                      dst_count_buf(m_idx),
+                      src_mean_buf[src_idx],
+                      src_var_buf[src_idx],
+                      src_count_buf[src_idx]);
+            });
+            if constexpr(GetActualVariance)
+            {
+                // Normalize the accumulated variance term by the merged count.
+                dst_var_buf(m_idx) = dst_var_buf[m_idx] / dst_count_buf[m_idx];
+            }
+        });
+    }
+};
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp