Merge branch gfx950 into andriy/lwpck-2413

e941f59f · Andriy Roshchenko · fe9d9812 · 7da48908 · e941f59f · e941f59f
Commit e941f59f authored Nov 01, 2024 by Andriy Roshchenko
20 changed files
--- a/example/ck_tile/05_reduce/reduce.hpp
+++ b/example/ck_tile/05_reduce/reduce.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
+namespace ck_tile {
+template <typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
+struct Reduce2dShape // compile-time tiling constants derived from the four seq<M, N> parameters
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{}); // element 0 (M) of BlockTile
+    static constexpr index_t Block_N = BlockTile::at(number<1>{}); // element 1 (N) of BlockTile
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{}); // element 0 (M) of WarpTile
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{}); // element 1 (N) of WarpTile
+    static constexpr index_t Vector_M = Vector::at(number<0>{}); // element 0 (M) of Vector
+    static constexpr index_t Vector_N = Vector::at(number<1>{}); // element 1 (N) of Vector
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); // warps along M
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); // warps along N
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; // integer division; assumes Warp_M is a multiple of Vector_M - TODO confirm
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; // integer division; assumes Warp_N is a multiple of Vector_N - TODO confirm
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); // how many times the combined warp tiles fit into the block tile along M
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); // same ratio along N
+    static constexpr index_t BlockSize = // warpSize times the BlockWarps entries folded with multiplies{} from an initial 1
+        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename BlockShape_,
+          typename ReduceOp_>
+struct Reduce2dProblem // bundles the data types, block shape and reduce operator of one reduction
+{
+    using XDataType       = remove_cvref_t<XDataType_>; // cv/ref qualifiers stripped
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using ReduceOp        = ReduceOp_; // taken as given (not passed through remove_cvref_t)
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; // true when more than one thread of a warp lies along N
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; // true when more than one warp of a block lies along N
+};
+template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
+struct Reduce // device functor: reduces an M x N input along its second dimension into M outputs
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+#if 0 // disabled variant: hard-codes a sum lambda and has no shared-memory cross-warp step
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
+    const
+    {
+        using S = typename Problem::BlockShape;
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{});
+        const auto iM = get_block_id() * S::Block_M;
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
+        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
+        const XDataType reduce_init_value = 0;
+        constexpr auto reduce_dims = sequence<1>{};
+        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
+            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
+        set_tile(y_compute, reduce_init_value);
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
+        block_tile_reduce_sync(y_compute, f_reduce);
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
+    }
+#else // active variant: reduce operator comes from Problem, block-level helpers come from Policy
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{}); // global view of p_x built from (M, N) and (N, 1)
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{}); // packed global view of p_y with M elements
+        const auto iM = get_block_id() * S::Block_M; // first row handled by this block
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>()); // Block_M x Block_N window starting at {iM, 0}
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM}); // Block_M-long output window starting at iM
+        __shared__ char smem[Policy::template GetSmemSize<Problem>()]; // shared-memory scratch handed to the cross-warp step below
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); // ceil(N / Block_N) tiles along N
+        auto reduce_func         = typename Problem::ReduceOp{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+        using XTensorType = decltype(load_tile(x_window));
+        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>(); // accumulator tile
+        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>()); // start from the operator's identity value
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_reduce2d(x, y_compute, reduce_func); // fold the loaded tile into y_compute
+            move_tile_window(x_window, {0, S::Block_N}); // advance by Block_N along the second dimension
+        }
+        block_reduce2d_sync(y_compute, reduce_func); // NOTE(review): presumably combines partial results within a warp - confirm in block_reduce2d
+        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); // NOTE(review): presumably combines partial results across warps via smem - confirm
+        store_tile(y_window, cast_tile<YDataType>(y_compute)); // convert to YDataType and write out
+    }
+#endif
+};
+} // namespace ck_tile
--- a/example/ck_tile/06_permute/CMakeLists.txt
+++ b/example/ck_tile/06_permute/CMakeLists.txt
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
+if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) # only pick a default when the caller has not set the variable
+# set(PERMUTE_USE_ALTERNATIVE_IMPL false)
+set(PERMUTE_USE_ALTERNATIVE_IMPL true) # alternative implementation is enabled by default
+endif()
+if(PERMUTE_USE_ALTERNATIVE_IMPL)
+target_compile_options(tile_example_permute PRIVATE -DPERMUTE_USE_ALTERNATIVE_IMPL) # defines the macro for this target's sources
+target_sources(tile_example_permute PRIVATE alternative_impl/matrix_core_swizzle.cpp) # extra source compiled only in this configuration
+endif()
+# target_compile_options(tile_example_permute PRIVATE -v --save-temps -Wno-gnu-line-marker)
--- a/example/ck_tile/06_permute/README.md
+++ b/example/ck_tile/06_permute/README.md
+# permute
+This folder contains an example of a permute kernel, which is similar to [torch.permute](https://pytorch.org/docs/stable/generated/torch.permute.html) (combined with [torch.contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html)). Currently we implement a generic permute kernel that supports arbitrary permutations of tensors up to rank 8 with a single kernel instance. Performance is not the first consideration; we prefer a simple and general kernel implementation using `ck_tile` in this example.
+```
+args:
+          -v    whether to do CPU validation or not (default:1)
+       -prec    data type. fp16/bf16/fp32 (default:fp16)
+      -shape    the shape of the input tensor (default:2,3,4)
+       -perm    permute perm (default:2,1,0)
+```
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with gfx90a, gfx942...
+make tile_example_permute -j
+```
+This will result in an executable `build/bin/tile_example_permute`
+## some examples
+```
+# torch
+x=torch.randn(2,3,4,6)
+y=x.permute(0,3,2,1).contiguous()
+# ck_tile
+./build/bin/tile_example_permute -shape=2,3,4,6 -perm=0,3,2,1
+```
+or you can try the smoke_test
+```
+# in the root of ck_tile, after you build this example
+sh example/ck_tile/06_permute/script/smoke_test.sh
+```
+### alternative implementation
+We have an alternative implementation under the `alternative_impl/` folder that can swizzle the tensor to be more friendly for data loading in the matrix core layout. This can be enabled when dealing with a `rank-7` tensor, with a fixed pattern of either `0,1,4,2,5,3,6` or `0,1,2,4,5,3,6`. There are other shape limitations in this implementation; check the source code of `permute.cpp` for details.
+```
+# example
+./build/bin/tile_example_permute -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 # b_n0_k0_n1_k1_n2_k2
+./build/bin/tile_example_permute -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 # b_n0_n1_k0_k1_n2_k2
+```
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "matrix_core_swizzle_kernel.hpp"
+#include <string>
+struct matrix_core_swizzle_traits // string selectors passed to the matrix_core_swizzle host API
+{
+    std::string data_type; // fp16 only
+    std::string inst;      // 32x32x8, 16x16x16
+    std::string permute;   // NOTE(review): presumably the permutation pattern, e.g. 0,1,4,2,5,3,6 - confirm in matrix_core_swizzle.cpp
+};
+using matrix_core_swizzle_args = matrix_core_swizzle_host_args; // alias of the host-args type from matrix_core_swizzle_kernel.hpp
+// host API
+float matrix_core_swizzle(matrix_core_swizzle_traits,
+                          matrix_core_swizzle_args,
+                          const ck_tile::stream_config&); // NOTE(review): the float result is presumably a measured kernel time - confirm in matrix_core_swizzle.cpp
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
--- a/example/ck_tile/06_permute/permute.cpp
+++ b/example/ck_tile/06_permute/permute.cpp
--- a/example/ck_tile/06_permute/permute.hpp
+++ b/example/ck_tile/06_permute/permute.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/permute.hpp"
+#include <string>
+struct permute_traits // string selector passed to the permute host API
+{
+    std::string data_type; // NOTE(review): presumably fp16/bf16/fp32 as listed for -prec in the README - confirm in permute.cpp
+};
+using permute_args = ck_tile::GenericPermuteHostArgs; // alias of the ck_tile generic permute host-args type
+// host API
+float permute(permute_traits, permute_args, const ck_tile::stream_config&); // NOTE(review): the float result is presumably a measured kernel time - confirm in permute.cpp
--- a/example/ck_tile/06_permute/script/smoke_test.sh
+++ b/example/ck_tile/06_permute/script/smoke_test.sh
--- a/example/ck_tile/09_topk_softmax/CMakeLists.txt
+++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt
+add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) # EXCLUDE_FROM_ALL: not built by the default "all" target
+target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) # lets the sources include headers from this directory
+set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) # start from an empty option list
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(tile_example_topk_softmax PRIVATE ${EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS}) # apply the collected options to this target only
--- a/example/ck_tile/09_topk_softmax/README.md
+++ b/example/ck_tile/09_topk_softmax/README.md
--- a/example/ck_tile/09_topk_softmax/script/smoke_test.sh
+++ b/example/ck_tile/09_topk_softmax/script/smoke_test.sh
--- a/example/ck_tile/09_topk_softmax/topk_softmax.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
--- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
+++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
--- a/example/ck_tile/10_rmsnorm2d/README.md
+++ b/example/ck_tile/10_rmsnorm2d/README.md
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
--- a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
--- a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp