Merge remote-tracking branch 'origin/develop' into myamlak/cgemm

bb1f8082 · root · 97ac5007 · 82d7d993 · bb1f8082 · 97ac5007
Commit bb1f8082 authored May 26, 2022 by root
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -306,7 +306,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
    __host__ __device__ static constexpr auto
    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
            c_grid_desc_m_n);
    }


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -259,7 +259,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
    __host__ __device__ static constexpr auto
    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
            c_grid_desc_m_n);
    }


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
 #pragma once
+
 #include "common_header.hpp"
 #include "multi_index_transform_helper.hpp"
 #include "tensor_descriptor.hpp"
@@ -287,11 +288,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }

    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -265,10 +265,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4

    // return block_id to C matrix tile idx (m0, n0) mapping
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
    }

    using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{}));

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -239,10 +239,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

    // return block_id to C matrix tile idx (m0, n0) mapping
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
    }

    __host__ __device__ static constexpr auto

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -300,11 +300,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }
    using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =
        remove_cvref_t<decltype(

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -309,11 +309,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }

    using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -316,11 +316,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }
    using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =
        remove_cvref_t<decltype(

--- a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dlops.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dlops.hpp
-#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP
-#define CK_THREADWISE_CONTRACTION_DLOPS_HPP
-
+#pragma once
 #include "common_header.hpp"
 #include "math.hpp"

@@ -25,9 +23,9 @@ template <typename FloatA,
                                 BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                                 CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                             bool>::type = false>
-struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
+struct ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
 {
-    __device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1()
+    __device__ constexpr ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1()
    {
        static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                          BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
@@ -124,9 +122,9 @@ template <typename FloatA,
                                 BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                                 CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                             bool>::type = false>
-struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
+struct ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
 {
-    __device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
+    __device__ constexpr ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
    {
        static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                          BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
@@ -220,4 +218,3 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
 };

 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+#pragma once

 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
@@ -609,4 +608,3 @@ struct ThreadwiseTensorSliceTransfer_v5r1
 };

 } // namespace ck
-#endif
--- a/include/ck/utility/generic_memory_space_atomic.hpp
+++ b/include/ck/utility/generic_memory_space_atomic.hpp
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
--- a/library/include/ck/library/host_tensor/host_common_util.hpp
+++ b/library/include/ck/library/host_tensor/host_common_util.hpp