Merge branch 'develop' into gridwise_2d

dc70e3e1 · arai713 · GitHub · 10947a54 · 8ee36118 · dc70e3e1
Unverified Commit dc70e3e1 authored Nov 01, 2022 by arai713 Committed by GitHub Nov 01, 2022
20 changed files
--- a/include/ck/tensor_operation/gpu/device/welford_helper.hpp
+++ b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Functor returning the reduce count of one thread for a blockwise Welford reduction.
+//
+// The reduction is walked in tiles of K_BlockTileSize, each thread covering
+// KThreadSliceSize per tile. When reduce_length is not a multiple of K_BlockTileSize the
+// last tile is only partially filled, so the count depends on thread_k_cluster_id.
+template <index_t K_BlockTileSize, index_t KThreadSliceSize>
+struct GetReduceCountPerThreadForBlockwiseWelford
+{
+    // numBlockTileIteration: number of tile iterations, including a partial last tile
+    // reduce_length:         total reduce length; only its remainder modulo
+    //                        K_BlockTileSize is kept
+    GetReduceCountPerThreadForBlockwiseWelford(index_t numBlockTileIteration,
+                                               long_index_t reduce_length)
+        : numBlockTileIteration_{numBlockTileIteration},
+          count_in_last_tile_(reduce_length % K_BlockTileSize)
+    {
+    }
+
+    // Returns the count for the thread identified by thread_k_cluster_id.
+    __device__ index_t operator()(index_t thread_k_cluster_id) const
+    {
+        const index_t count_all_tiles_full = KThreadSliceSize * numBlockTileIteration_;
+
+        // Remainder of zero: every tile is complete, all threads get the full count.
+        if(count_in_last_tile_ == 0)
+            return count_all_tiles_full;
+
+        // Threads below this index still own a complete slice in the last tile.
+        const index_t full_slices_in_last_tile = count_in_last_tile_ / KThreadSliceSize;
+
+        if(thread_k_cluster_id < full_slices_in_last_tile)
+            return count_all_tiles_full;
+
+        const index_t count_without_last_tile = KThreadSliceSize * (numBlockTileIteration_ - 1);
+
+        // Threads past the boundary contribute nothing in the last tile.
+        if(thread_k_cluster_id > full_slices_in_last_tile)
+            return count_without_last_tile;
+
+        // The boundary thread owns the partially filled slice.
+        return count_without_last_tile + count_in_last_tile_ % KThreadSliceSize;
+    }
+
+    index_t numBlockTileIteration_;
+    index_t count_in_last_tile_;
+};
+
+// Functor returning the reduce count of one thread for a multiblock Welford reduction.
+//
+// Blocks with block_local_id < blkGroupSize - 1 each get
+// K_BlockTileSize * numBlockTileIteration of the reduce length; the last block gets
+// whatever remains, which may end in a partially filled tile.
+template <index_t K_BlockTileSize, index_t KThreadSliceSize>
+struct GetReduceCountPerThreadForMultiblockWelford
+{
+    // blkGroupSize:          number of blocks in the group (presumably the blocks sharing
+    //                        one reduction -- confirm against the caller)
+    // numBlockTileIteration: tile iterations done by every block except the last
+    // reduce_length:         total reduce length
+    GetReduceCountPerThreadForMultiblockWelford(index_t blkGroupSize,
+                                                index_t numBlockTileIteration,
+                                                long_index_t reduce_length)
+        : blkGroupSize_(blkGroupSize), numBlockTileIteration_{numBlockTileIteration}
+    {
+        // Evaluate the product in long_index_t so it cannot wrap in index_t arithmetic
+        // before it is subtracted from the (long_index_t) reduce_length.
+        const long_index_t length_before_last_block =
+            static_cast<long_index_t>(K_BlockTileSize) * numBlockTileIteration_ *
+            (blkGroupSize_ - 1);
+
+        last_block_reduce_length_ =
+            static_cast<index_t>(reduce_length - length_before_last_block);
+        numBlockTileIterationByLastBlock_ =
+            (last_block_reduce_length_ + K_BlockTileSize - 1) / K_BlockTileSize;
+    }
+
+    // Returns the count for thread thread_k_cluster_id of block block_local_id.
+    __device__ index_t operator()(index_t block_local_id, index_t thread_k_cluster_id) const
+    {
+        // Every block but the last, or a last block that happens to be full-sized.
+        if(last_block_reduce_length_ == K_BlockTileSize * numBlockTileIteration_ ||
+           block_local_id < blkGroupSize_ - 1)
+            return (KThreadSliceSize * numBlockTileIteration_);
+
+        index_t count_in_last_tile = last_block_reduce_length_ % K_BlockTileSize;
+
+        if(count_in_last_tile == 0)
+            return (KThreadSliceSize * numBlockTileIterationByLastBlock_);
+        else
+        {
+            index_t num_complete_slice = count_in_last_tile / KThreadSliceSize;
+            // Only the remainder belongs to the boundary thread. Adding the whole
+            // count_in_last_tile here would count the complete slices of the lower
+            // threads a second time (the blockwise functor uses the same remainder).
+            index_t count_in_last_slice = count_in_last_tile % KThreadSliceSize;
+
+            if(thread_k_cluster_id < num_complete_slice)
+                return (KThreadSliceSize * numBlockTileIterationByLastBlock_);
+            else if(thread_k_cluster_id == num_complete_slice)
+                return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1) +
+                        count_in_last_slice);
+            else
+                return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1));
+        }
+    }
+
+    index_t blkGroupSize_;
+    index_t numBlockTileIteration_;
+
+    index_t last_block_reduce_length_;
+    index_t numBlockTileIterationByLastBlock_;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
@@ -66,6 +66,7 @@ template <index_t BlockSize,
          index_t MPerBlock,
          index_t NPerBlock,
          index_t K0PerBlock,
+          index_t K1Value,
          index_t M1PerThreadM111,
          index_t N1PerThreadN111,
          index_t KPerThread,
@@ -96,7 +97,7 @@ struct GridwiseGemmDl_km_kn_mn_v1r3
    static constexpr auto I3 = Number<3>{};

    // K1 should be Number<...>
-    static constexpr auto K1 = AGridDesc_K0_M_K1{}.GetLength(I2);
+    static constexpr auto K1 = Number<K1Value>{};

    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

 namespace ck {

--- a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -593,7 +593,8 @@ struct XdlopsGemm
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};

-    using CIndex = MultiIndex<2>;
+    using CIndex   = MultiIndex<2>;
+    using CIndex4D = MultiIndex<4>;

    __device__ static constexpr index_t GetNumBlks() { return mfma_instr.num_output_blks; }

@@ -822,6 +823,16 @@ struct XdlopsGemm
        return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset};
    }

+    // Builds a 4D begin index from the two components returned by GetBlkIdx().
+    // Both parameters are unused. With TransposeC the components land in positions
+    // 0 and 2 (second component first); otherwise in positions 1 and 3. The remaining
+    // positions are I0.
+    __device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */)
+    {
+        const auto thread_blk_idx = GetBlkIdx();
+
+        const auto first_component  = thread_blk_idx[I0];
+        const auto second_component = thread_blk_idx[I1];
+
+        if(TransposeC)
+            return CIndex4D{second_component, I0, first_component, I0};
+
+        return CIndex4D{I0, first_component, I0, second_component};
+    }
+
    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops>{};

    static constexpr auto mfma_instr = mfma.selected_mfma;

--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -6,6 +6,7 @@ function(add_instance_library INSTANCE_NAME)
    clang_tidy_check(${INSTANCE_NAME})
 endfunction(add_instance_library INSTANCE_NAME)

+
 file(GLOB dir_list LIST_DIRECTORIES true *)
 set(CK_DEVICE_INSTANCES)
 FOREACH(subdir_path ${dir_list})

--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp