Commit c5b3f69f authored by Chao Liu's avatar Chao Liu
Browse files

pseudo code

parent 800f0ab3
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"
// program
// program
struct GemmMultiplD
{
    // Builds a 1-D packed tensor descriptor of extent `x` through the tile
    // program and prints its length. `y` is currently unused (kept so the
    // launch signature stays unchanged).
    __host__ __device__ void operator()(TileProgram& tp, int x, int y)
    {
        const auto desc = tp.make_naive_tensor_descriptor_packed(ck::make_tuple(x));
        const auto len  = desc.GetLength(ck::Number<0>{});
        printf("length %d\n", len);
    }
};
int main()
{
    // Demo driver: launch the functor defined above on a 1x1 grid.
    // was: launched `HelloWorld{}`, which is not defined in this file —
    // the functor defined above is `GemmMultiplD`.
    int x = 100;
    int y = 101;

    launch(GemmMultiplD{}, 1, 1, x, y);

    return 0;
}
template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation>
struct GemmMultiD
{
    // Number of auxiliary D tensors fused into the epilogue: E = cde_op(A*B, Ds...).
    static constexpr index_t NumDTensor = DsDataType::Size();

    // Tile-program pseudo code for a multi-D GEMM.
    // NOTE(review): this is illustrative pseudo code — the strategy objects
    // (a_lds_block_strategy, block_gemm_strategy, ...), the element-wise
    // operator instances (a_element_op, ...), and the tile sizes
    // (kMPerBlock, kNPerBlock, kKPerBlock, kMPerShuffle, kNPerShuffle)
    // are assumed to be in scope; `xxx` placeholders are kept from the original.
    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, 2> a_m_k_lengths,
               const std::array<index_t, 2> a_m_k_strides,
               const std::array<index_t, 2> b_n_k_lengths,
               const std::array<index_t, 2> b_n_k_strides,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_lengths,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_strides,
               const std::array<index_t, 2> e_m_n_lengths,
               const std::array<index_t, 2> e_m_n_strides,
               //
               // was: `const T*` with undefined T, and std::array<const T*> missing its size
               const ADataType* p_a,
               const BDataType* p_b,
               const std::array<const void*, NumDTensor> p_ds,
               EDataType* p_e)
    {
        using namespace ck;

        const auto b = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);

        const auto ds = tp(generate_tuple(
            [&](auto i) {
                // was: statement ended with `,` instead of `;`
                return make_naive_tensor(ds_m_n_lengths[i], ds_m_n_strides[i], p_ds[i]);
            },
            Number<NumDTensor>{}));

        auto e = tp(make_naive_tensor(e_m_n_lengths, e_m_n_strides), p_e);

        // divide problem into M x N tiles, one tile per workgroup
        const auto num_m = e_m_n_lengths[0];
        const auto num_n = e_m_n_lengths[1];

        const auto id_block = get_block_1d_id();

        // was: num_gemmm / num_gemmn (undefined) divided by MPerTile / NPerTile
        const auto num_tile_m = num_m / kMPerBlock;
        const auto num_tile_n = num_n / kNPerBlock;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_n)));

        // was: CalculateBottonIndex (typo)
        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_n = id_tile.At<1>();

        // A/B in DRAM
        // A/B DRAM layout is part of problem, not solution
#if 1
        // DO NOT let user know there is optimization on tensor transform on A/B DRAM tensor
        // was: p_a_dram / p_b_dram (undefined) — the parameters are p_a / p_b
        const auto a_dram_global = tp(make_naive_tensor(a_m_k_lengths, a_m_k_strides), p_a);
        const auto b_dram_global = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);
#endif

        // A/B tile in LDS
        // A/B LDS layout is part of solution
        // was: shared_memmory (typo)
        ADataType* p_a_lds = shared_memory.get_pointer(0);

        // [allow optimization] allow different LDS layouts
        constexpr auto a_lds_block =
            make_tensor(p_a_lds, {kMPerBlock, kKPerBlock}, a_lds_block_strategy);

        constexpr auto a_lds_byte = a_lds_block.get_num_of_byte();

        BDataType* p_b_lds = shared_memory.get_aligned_pointer(a_lds_byte);

        // [allow optimization] allow different LDS layouts
        // was: make_tensor({p_b_lds, kNPerBlock, kKPerBlock}, ...) — pointer inside the
        // brace list instead of being the first argument (compare the A tensor above)
        constexpr auto b_lds_block =
            make_tensor(p_b_lds, {kNPerBlock, kKPerBlock}, b_lds_block_strategy);

        // A/B copy
#if 0
        auto a_block_copy = make_copier(a_dram_global,
                                        a_lds_block,
                                        make_tuple(kMPerBlock, kKPerBlock),
                                        make_tuple(id_tile_m * kMPerBlock, 0),
                                        a_block_copy_strategy);

        auto b_block_copy = make_copier(b_dram_global,
                                        b_lds_block,
                                        make_tuple(kNPerBlock, kKPerBlock),
                                        make_tuple(id_tile_n * kNPerBlock, 0),
                                        b_block_copy_strategy);
#else
        // was: K-offset used the undefined id_tile_k — the A tile starts at K == 0
        // (the main loop below advances the window along K), matching the #if 0 branch
        auto window_a_dram = make_window(a_dram_global,
                                         {kMPerBlock, kKPerBlock},
                                         {id_tile_m * kMPerBlock, 0},
                                         a_dram_window_map_strategy);

        // was: {NPerTile, KPerTile} — the A window is M x K
        auto window_a_block =
            make_window(a_lds_block, {kMPerBlock, kKPerBlock}, {0, 0}, a_lds_window_map_strategy);

        // was missing: window_b_dram is consumed by the main loop but never created
        auto window_b_dram = make_window(b_dram_global,
                                         {kNPerBlock, kKPerBlock},
                                         {id_tile_n * kNPerBlock, 0},
                                         b_dram_window_map_strategy);
#endif

#if 1
        // block GEMM
        // operation-based syntax: per-operation solution strategy
        auto block_gemm = make_block_gemm(a_lds_block, b_lds_block, block_gemm_strategy);
#endif

        // Distributed C accumulator in VGPR
#if 1
        // C layout is decided alone
        // C should be distributed
        auto c_vgpr_block =
            make_distributed_tensor({kMPerBlock, kNPerBlock}, c_vgpr_block_strategy);
#elif 0
        // C layout is decided by block GEMM
        auto c_vgpr_block = block_gemm.get_c_vgpr_block();
#endif

        // main K loop: stream A/B tiles through LDS and accumulate into C
        // was: `K` undefined — the GEMM K extent is a_m_k_lengths[1]
        const auto num_k = a_m_k_lengths[1];

        for(index_t k = 0; k < num_k; k += kKPerBlock)
        {
            auto a_vgpr_block_tmp = load(window_a_dram, a_dram_load_strategy);
            auto b_vgpr_block_tmp = load(window_b_dram, b_dram_load_strategy);

            // apply per-tensor element-wise ops while data is in registers
            // was: a_vpgr_block / b_vpgr_block (typos — the copies below read a_vgpr_block)
            auto a_vgpr_block = elementwise_op(a_vgpr_block_tmp, a_element_op);
            auto b_vgpr_block = elementwise_op(b_vgpr_block_tmp, b_element_op);

            copy(a_vgpr_block, a_lds_block, a_lds_store_strategy);
            copy(b_vgpr_block, b_lds_block, b_lds_store_strategy);

            // barrier between LDS store and the block GEMM's LDS reads
            block_sync_lds();

            dot_product_accumulate(c_vgpr_block, a_lds_block, b_lds_block);

            // barrier before the next iteration overwrites the LDS tiles
            block_sync_lds();

            window_a_dram += {0, kKPerBlock};
            window_b_dram += {0, kKPerBlock};
        }

        // epilogue: shuffle C through LDS, fuse D, write E
        // NOTE(review): `xxx` placeholders kept from the original pseudo code
        auto p_c_lds = xxx;
        auto c_lds   = make_tensor(p_c_lds, xxxxxx);

        // was: window over undefined `c_vgpr` — the accumulator is c_vgpr_block
        auto window_c_vgpr =
            make_window(c_vgpr_block, {kMPerShuffle, kNPerShuffle}, {0, 0}, c_vgpr_window_strategy);

        auto window_c_lds =
            make_window(c_lds, {kMPerShuffle, kNPerShuffle}, {0, 0}, c_lds_window_strategy);

        // NOTE(review): d_dram_global / e_dram_global are not constructed above —
        // presumably windows over `ds` / `e`; confirm against the real implementation.
        // was: tile offsets written id_tile_m * kMPerTile (undefined name)
        auto window_d_dram = make_window(d_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerBlock, id_tile_n * kNPerBlock},
                                         d_dram_window_strategy);

        auto window_e_dram = make_window(e_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerBlock, id_tile_n * kNPerBlock},
                                         e_dram_window_strategy);

        // was: loop variables m / n undeclared
        for(index_t m = 0; m < kMPerBlock; m += kMPerShuffle)
        {
            for(index_t n = 0; n < kNPerBlock; n += kNPerShuffle)
            {
                // write C into LDS for shuffle
                copy(window_c_vgpr, window_c_lds, c_lds_store_strategy);

                // load C from LDS to complete shuffle
                auto c_vgpr_slice_shuffled = load(window_c_lds, c_lds_load_strategy);

                // load D from DRAM
                auto d_vgpr_block_slice = load(window_d_dram, d_dram_load_strategy);

                // element-wise op
                // [Question] need to guarantee it always functions:
                // 1. C/D should have same layout, how to guarantee?
                // 2. if C/D have different layout, then need to do shuffle
                // 3. if C/D have different layout, what should E layout be?
                // was: read undefined c_vgpr_block_slice — the shuffled slice is
                // c_vgpr_slice_shuffled, loaded just above
                auto e_vgpr_block_slice =
                    elementwise_op(c_vgpr_slice_shuffled, d_vgpr_block_slice, cd_elementwise_op);

                // write E into DRAM
                copy(e_vgpr_block_slice, window_e_dram, e_dram_store_strategy);
            }
        }
    }
};
......@@ -19,6 +19,8 @@ struct HelloWorld
auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
// only for testing purpose
// cpu should not do work here
res[0] = desc0.GetLength(ck::Number<0>{});
res[1] = desc1.GetLength(ck::Number<0>{});
}
......
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"
// program
// program
// was: `T` used in the signature but never declared — added as a defaulted
// template parameter so existing Im2Col<NDim, ALayout> instantiations still work
template <ck::index_t NDimSpatial, typename ALayout, typename T = float>
struct Im2Col
{
    // Tile-program pseudo code: materialize the im2col (implicit-GEMM) view of a
    // convolution input as an explicit GemmG x GemmM x GemmK matrix in p_a_mtx.
    // Each workgroup copies one M x K tile, iterating over the GemmG dimension.
    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
               // was missing: output descriptor arrays — used below and passed by main()
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_dilations,
               const std::array<index_t, NDimSpatial>& input_left_pads,
               const std::array<index_t, NDimSpatial>& input_right_pads,
               //
               // was: std::array<index_t, 2> — these carry three extents
               // (GemmG, GemmM, GemmK), indexed [0..2] below
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_lengths,
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_strides,
               //
               const T* p_a_img,
               T* p_a_mtx)
    {
        using namespace ck;

        // source: convolution-input view of A, transformed to a GEMM M x K descriptor
        const auto a_src_desc =
            tp(TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>::
                   template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
                                                         a_g_n_c_wis_strides,
                                                         b_g_k_c_xs_lengths,
                                                         b_g_k_c_xs_strides,
                                                         c_g_n_k_wos_lengths,
                                                         c_g_n_k_wos_strides,
                                                         conv_filter_strides,
                                                         conv_filter_dilations,
                                                         input_left_pads,
                                                         input_right_pads));

        // destination: plain strided GemmG x GemmM x GemmK matrix
        // was: a_gemmm_gemmk_lengths / a_gemmm_gemmk_strides (undefined names)
        const auto a_dst_desc = tp(make_naive_tensor_descriptor(a_gemmg_gemmm_gemmk_lengths,
                                                                a_gemmg_gemmm_gemmk_strides));

        const auto a_src = make_tensor(a_src_desc, p_a_img);
        const auto a_dst = make_tensor(a_dst_desc, p_a_mtx);

        // was: a_gemmg_gemmm_gemmk_c_wis_lengths (undefined name)
        const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
        const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
        const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];

        const auto id_block = get_block_1d_id();

        // map the 1-D block id onto the (M-tile, K-tile) grid
        const auto num_tile_m = num_gemmm / MPerTile;
        const auto num_tile_k = num_gemmk / KPerTile;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));

        // was: CalculateBottonIndex (typo)
        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_k = id_tile.At<1>();

#if 0
        // operation-based syntax: per-operation solution strategy
        // operation here is data movement
        auto copier = make_copier(a_src,
                                  a_dst,
                                  make_tuple(1, MPerTile, KPerTile),
                                  make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                  copy_strategy);

        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copier();
            copier.move_src_window(make_tuple(1, 0, 0));
            copier.move_dst_window(make_tuple(1, 0, 0));
        }
#else
        // data-based syntax: per-data solution strategy
        auto window_a_src = make_window(a_src,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_src_window_map_strategy);

        auto window_a_dst = make_window(a_dst,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_dst_window_map_strategy);

        // walk the GemmG (group/batch) dimension, copying one slice per step
        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copy(window_a_src, window_a_dst, a_copy_strategy);

            window_a_src += make_tuple(1, 0, 0);
            window_a_dst += make_tuple(1, 0, 0);
        }
#endif
    }
};
int main()
{
    // Element type of the input image / matrix buffers.
    // was: InDataType used below but never defined
    using InDataType = float;

    // was: non-constexpr, but used below as a std::array extent
    // (a non-constant expression is ill-formed there)
    constexpr ck::index_t NumDimSpatial = 2;

    // grouped 2-D convolution problem: G groups, NHWC-style image, 3x3 filter
    ck::index_t G  = 32;
    ck::index_t N  = 256;
    ck::index_t K  = 192;
    ck::index_t C  = 192;
    ck::index_t Y  = 3;
    ck::index_t X  = 3;
    ck::index_t Hi = 28;
    ck::index_t Wi = 28;
    ck::index_t Ho = 28;
    ck::index_t Wo = 28;

    // lengths in memory order; strides seeded with innermost stride 1 and
    // filled in below as packed suffix products
    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};

    // packed strides: stride[i] = product of lengths[i+1..end]
    std::partial_sum(rbegin(in_lengths),
                     std::prev(rend(in_lengths)),
                     std::next(rbegin(in_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(wei_lengths),
                     std::prev(rend(wei_lengths)),
                     std::next(rbegin(wei_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(out_lengths),
                     std::prev(rend(out_lengths)),
                     std::next(rbegin(out_strides)),
                     std::multiplies<>{});

    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
    // (rotate the last three length/stride entries so C leads the spatial dims)
    std::rotate(
        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
    std::rotate(
        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
    std::rotate(
        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
    std::rotate(
        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
    std::rotate(
        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
    std::rotate(
        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    // GEMM view of A: GemmG = G, GemmM = N * Ho * Wo, GemmK = C * Y * X
    // was: size NumDimSpatial + 3 (5 slots) with only 3 initializers, and
    // GemmM written as G * Ho * Wo — the matrix buffer below is sized with
    // N * Ho * Wo, so N is intended here
    std::array<ck::index_t, 3> in_mtx_lengths{G, N * Ho * Wo, C * Y * X};
    std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};

    std::partial_sum(rbegin(in_mtx_lengths),
                     std::prev(rend(in_mtx_lengths)),
                     std::next(rbegin(in_mtx_strides)),
                     std::multiplies<>{});

    DeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    DeviceMem in_mtx(sizeof(InDataType) * G * N * Ho * Wo * C * Y * X);

    // was: launched undefined `HelloWorld{}` — the functor defined above is Im2Col.
    // NOTE(review): layout tag assumed to describe the original GNHWC memory
    // layout (strides were rotated above) — confirm against TransformConvFwdToGemm.
    launch(Im2Col<NumDimSpatial, ck::tensor_layout::convolution::GNHWC>{},
           1,
           1,
           in_lengths,
           in_strides,
           wei_lengths,
           wei_strides,
           out_lengths,
           out_strides,
           filter_strides,
           filter_dilations,
           input_left_pads,
           input_right_pads,
           //
           in_mtx_lengths,
           in_mtx_strides,
           //
           in.GetDeviceBuffer(),
           in_mtx.GetDeviceBuffer());

    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment