Merge branch 'develop' into feature/use-larger-tile-size-for-chunk-prefill

d783a8cf · Po Yen Chen · 1b130866 · 4cb3d7d7 · d783a8cf · d783a8cf
Commit d783a8cf authored Dec 05, 2024 by Po Yen Chen
20 changed files
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+namespace ck_tile {
+// Host-side description of one batched GEMM launch. The fields are copied
+// one-to-one into the kernel argument struct by BatchedGemmKernel::MakeKargs.
+// Member order is part of the interface (aggregate initialization).
+struct BatchedGemmHostArgs
+{
+    // Base addresses of the A and B inputs and of the C output (batch 0).
+    const void* a_ptr;
+    const void* b_ptr;
+    void* c_ptr;
+    // Problem extents: A is viewed as M x K, B as N x K and C as M x N.
+    index_t M, N, K;
+    // Stride of the non-unit dimension of each matrix view.
+    index_t stride_A, stride_B, stride_C;
+    // Distance between consecutive batches; added to the typed base pointers,
+    // so expressed in elements rather than bytes.
+    index_t batch_stride_A, batch_stride_B, batch_stride_C;
+    // Number of GEMMs in the batch; forwarded to TilePartitioner::GridSize.
+    index_t batch_count;
+};
+// Batched GEMM kernel functor.
+//
+// Each workgroup computes one TilePartitioner::kM x TilePartitioner::kN tile of
+// C for the batch selected by blockIdx.z. The (M, N) tile origin comes from
+// TilePartitioner, the main loop over K is delegated to GemmPipeline and the
+// store of the accumulator tile to EpiloguePipeline.
+//
+// Template parameters:
+//   TilePartitioner_   - provides GridSize(), GetLoopNum(), kM/kN/kK and the
+//                        per-workgroup tile origin via operator().
+//   GemmPipeline_      - provides layouts, data types, vector sizes, padding
+//                        flags, BlockSize, GetSmemSize() and the block GEMM.
+//   EpiloguePipeline_  - provides ODataType, GetSmemSize() and the C store.
+template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
+struct BatchedGemmKernel
+{
+    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    // Device-side kernel arguments. Mirrors BatchedGemmHostArgs field for field;
+    // MakeKargs relies on this exact member order.
+    struct BatchedGemmKargs
+    {
+        const void* a_ptr;
+        const void* b_ptr;
+        void* c_ptr;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t stride_A;
+        index_t stride_B;
+        index_t stride_C;
+        index_t batch_stride_A;
+        index_t batch_stride_B;
+        index_t batch_stride_C;
+        index_t batch_count;
+    };
+    using Kargs = BatchedGemmKargs;
+    using Hargs = BatchedGemmHostArgs;
+    // Launch grid for the problem in `h`; batch_count is forwarded as the third
+    // argument because the kernel reads the batch index from blockIdx.z.
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
+    {
+        return TilePartitioner::GridSize(h.M, h.N, h.batch_count);
+    }
+    // One-dimensional workgroup of GemmPipeline::BlockSize threads.
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    // Converts host arguments into kernel arguments.
+    // Aggregate initialization is used instead of assigning into an
+    // uninitialized local: an uninitialized variable inside a constexpr function
+    // prevents constant evaluation before C++20.
+    CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h)
+    {
+        return Kargs{h.a_ptr,
+                     h.b_ptr,
+                     h.c_ptr,
+                     h.M,
+                     h.N,
+                     h.K,
+                     h.stride_A,
+                     h.stride_B,
+                     h.stride_C,
+                     h.batch_stride_A,
+                     h.batch_stride_B,
+                     h.batch_stride_C,
+                     h.batch_count};
+    }
+    // Shared-memory bytes to reserve: the larger of what the GEMM pipeline and
+    // the epilogue pipeline each request.
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+    // Kernel body: computes and stores one C tile for one batch entry.
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        // Tile origin in (M, N) for this workgroup and the batch it works on.
+        const auto [i_m, i_n] = TilePartitioner{}();
+        const auto i_batch    = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        // Per-batch element offsets into A and B.
+        // NOTE(review): i_batch * batch_stride_* is evaluated in 32-bit index
+        // arithmetic; confirm callers keep batch_count * batch_stride in range.
+        const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
+        const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
+        const ADataType* a_start  = static_cast<const ADataType*>(kargs.a_ptr);
+        const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
+        const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
+        const BDataType* b_start  = static_cast<const BDataType*>(kargs.b_ptr);
+        // Convert pointers to tensor views.
+        // A is always viewed as (M, K); only the strides depend on the layout.
+        auto a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start + batch_offset_A,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::VectorSizeA>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start + batch_offset_A,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(1, kargs.stride_A),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+        // B is always viewed as (N, K); row-major B therefore has unit stride
+        // along N, any other layout has unit stride along K.
+        auto b_tensor_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start + batch_offset_B,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(1, kargs.stride_B),
+                    number<1>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start + batch_offset_B,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(kargs.stride_B, 1),
+                    number<GemmPipeline::VectorSizeB>{},
+                    number<1>{});
+            }
+        }();
+        // Pad A to kM x kK tiles; only the unit-stride dimension is padded
+        // (K for row-major A, M otherwise).
+        auto a_pad_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        // Window over A starting at row i_m, column 0 of K.
+        auto a_block_window = make_tile_window(
+            a_pad_view,
+            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+            {i_m, 0});
+        // Pad B to kN x kK tiles; only the unit-stride dimension is padded
+        // (K for column-major B, N otherwise).
+        auto b_pad_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadN, false>{});
+            }
+        }();
+        // Window over B starting at row i_n, column 0 of K.
+        auto b_block_window = make_tile_window(
+            b_pad_view,
+            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+            {i_n, 0});
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+        // Number of K iterations the pipeline has to run for this problem.
+        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
+        // Run GEMM cooperatively by whole workgroup.
+        // Plain call syntax: `.template operator()` without a template argument
+        // list is rejected by newer Clang, and the epilogue below is already
+        // invoked this way.
+        auto c_block_tile = GemmPipeline{}(a_block_window, b_block_window, num_loop, smem_ptr);
+        // Per-batch element offset into C (same 32-bit caveat as for A and B).
+        const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C);
+        const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C);
+        CDataType* c_start        = static_cast<CDataType*>(kargs.c_ptr);
+        // C is viewed as (M, N); only the strides depend on the layout.
+        auto c_tensor_view        = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start + batch_offset_C,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<GemmPipeline::VectorSizeC>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start + batch_offset_C,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+        // Pad C to kM x kN tiles along its unit-stride dimension.
+        auto c_pad_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<false, GemmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        // Output window at this workgroup's (i_m, i_n) tile origin.
+        auto c_block_window = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+            {i_m, i_n});
+        // Hand the accumulator tile to the epilogue for the store into C.
+        EpiloguePipeline{}(c_block_window, c_block_tile);
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -35,4 +35,40 @@ struct GemmTilePartitioner
        return make_tuple(iM, iN);
    }
 };
+// Tile partitioner that places every (M, N) output tile on grid.x.
+// Tile extents are taken from BlockGemmShape_ (kM, kN, kK).
+template <typename BlockGemmShape_>
+struct GemmTile1DPartitioner
+{
+    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
+    // Per-workgroup tile extents along M, N and K.
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+    // Launch grid: (tiles along M) * (tiles along N) workgroups, all in x.
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N)
+    {
+        const index_t tiles_along_m = (M + MPerBlock - 1) / MPerBlock;
+        const index_t tiles_along_n = (N + NPerBlock - 1) / NPerBlock;
+        return dim3(tiles_along_m * tiles_along_n, 1, 1);
+    }
+    // Number of NPerBlock-wide tiles needed to cover an extent of N.
+    CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N)
+    {
+        return integer_divide_ceil(N, NPerBlock);
+    }
+    // Number of KPerBlock-deep steps needed to cover an extent of K.
+    CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K)
+    {
+        return integer_divide_ceil(K, KPerBlock);
+    }
+    // Returns the (M, N) element offsets of this workgroup's tile.
+    //   blockOffset - subtracted from blockIdx.x before the tile lookup.
+    //   NBlockSize  - extent along N; forwarded to GetNBlock to obtain the
+    //                 number of tiles per row of the tile grid.
+    CK_TILE_DEVICE auto operator()(index_t blockOffset, index_t NBlockSize)
+    {
+        // Kept as separate sub-expressions with the original operand types so
+        // the integer conversions match the unhoisted form.
+        const auto local_block_id = blockIdx.x - blockOffset;
+        const auto tiles_along_n  = GetNBlock(NBlockSize);
+        index_t iM = __builtin_amdgcn_readfirstlane(local_block_id / tiles_along_n * MPerBlock);
+        index_t iN = __builtin_amdgcn_readfirstlane(local_block_id % tiles_along_n * NPerBlock);
+        return make_tuple(iM, iN);
+    }
+};
 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -124,7 +124,7 @@ struct GemmPipelineAGmemBGmemCRegV1
            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
        // Block GEMM
-        constexpr auto block_gemm = Policy::template GetBlockGemm<Problem>();
+        auto block_gemm = Policy::template GetBlockGemm<Problem>();
        // Acc register tile
        auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator
            auto f_mk_kn_mn = [&](auto m, auto n) {
                const int K = arg.a_m_k_.mDesc.GetLengths()[1];
-                AccDataType v_acc = 0;
+                AccDataType v_acc{0};
-                ComputeTypeA v_a  = 0;
+                ComputeTypeA v_a{0};
-                ComputeTypeB v_b  = 0;
+                ComputeTypeB v_b{0};
                for(int k = 0; k < K; ++k)
                {
@@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                }
-                CDataType v_c = 0;
+                CDataType v_c{0};
                arg.c_element_op_(v_c, v_acc);

--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
@@ -15,7 +15,7 @@ void add_device_pool3d_fwd_ndhwc_f8_instances(
        instances)
 {
    add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
 }
 void add_device_pool3d_fwd_ndhwc_index_f8_instances(
@@ -23,7 +23,7 @@ void add_device_pool3d_fwd_ndhwc_index_f8_instances(
        instances)
 {
    add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, true>{});
 }
 } // namespace instance

--- a/profiler/README.md
+++ b/profiler/README.md
+[Back to the main page](../README.md)
+# Composable Kernel profiler
 ## Profile GEMM kernels
 ```bash
 #arg1: tensor operation (gemm=GEMM)
@@ -180,3 +182,13 @@ Note: Column to image kernel adds to the output memory, this will cause output b
 ################            op datatype  verify  init  log  time  dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
 ./bin/ckProfiler permute_scale        0       1     1    0     1    64   64   64       4096         64          1           1          64        4096
 ```
+## Convert MIOpen driver command to CKProfiler
+```bash
+python3 ../script/convert_miopen_driver_to_profiler.py \
+/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3 \
+-p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
+```
+Only the convolution driver is supported.
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -150,7 +150,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
        break;
    default:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
    }

--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
--- a/python/ck4inductor/batched_universal_gemm/gen_instances.py
+++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py
--- a/python/ck4inductor/batched_universal_gemm/op.py
+++ b/python/ck4inductor/batched_universal_gemm/op.py
--- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py
+++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
 add_subdirectory(image_to_column)
 add_subdirectory(gemm)
+add_subdirectory(batched_gemm)
+add_subdirectory(grouped_gemm)