Merge pull request #63 from ROCm/lwpck-1559

Merge changes for gfx12.

Merge pull request #63 from ROCm/lwpck-1559
Merge changes for gfx12.
5dbbf5d6 · Illia Silin · GitHub · 78f637e4 · 2d3d7190 · 5dbbf5d6
Unverified Commit 5dbbf5d6 authored May 08, 2024 by Illia Silin Committed by GitHub May 08, 2024
3 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -36,7 +36,8 @@ __global__ void
                                const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||         \
-    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
+    defined(__gfx12__))
    GridwiseTensorRearrangeKernel::Run(in_grid_desc,
                                       p_in_global,
                                       out_grid_desc,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1333,7 +1333,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
    ElementwiseOperation element_op_;
 };

-// Specilized for WMMA
+// Specilized for WMMA-Navi3
 // A single Wave32 is composed by double row
 // Data exchange allowed between these two rows
 // This RowLane Dst buf will be filled from two Src buf
@@ -1468,4 +1468,111 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
    ElementwiseOperation element_op_{};
 };

+// Specilized for WMMA-Navi4
+template <typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          typename ElementwiseOperation,
+          typename SliceLengths,
+          typename DimAccessOrder,
+          index_t DstVectorDim,
+          index_t DstScalarPerVector,
+          bool IntraRowSwizzlePerm,
+          typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
+                             bool>::type = false>
+struct ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index& src_idx)
+    {
+        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc need to known at compile-time");
+
+        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
+                      "wrong! Not divisible");
+        ignore = src_idx;
+    }
+
+    template <typename SrcSliceOriginIdx,
+              typename DstSliceOriginIdx,
+              typename SrcBuffer,
+              typename DstBuffer>
+    __device__ void Run(const SrcDesc&,
+                        const SrcSliceOriginIdx&,
+                        const SrcBuffer& src_buf,
+                        const DstDesc&,
+                        const DstSliceOriginIdx&,
+                        DstBuffer& dst_buf) const
+    {
+        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc need to known at compile-time");
+
+        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
+                          is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
+                      "wrong! SliceOrigin need to known at compile-time");
+
+        static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
+                      "wrong! Buffer need to be StaticBuffer");
+
+        // SrcDesc and src_slice_origin_idx are known at compile-time
+        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
+        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
+        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
+        constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
+
+        // scalar per access on each dim
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto dst_scalar_step_in_vector =
+            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
+
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;
+
+        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
+                      "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
+
+        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
+
+        static_for<0, num_access, 1>{}([&](auto idx_1d) {
+            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
+
+            // copy data from src_buf into dst_vector
+            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
+                // src_desc error, non constexpr, caused by merge transform
+                constexpr index_t src_offset = src_desc.CalculateOffset(
+                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+
+                constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                    dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+
+                SrcData v_this_row;
+                // int type temp value due to intrinsic requirement
+                int temp = 0;
+
+                // apply element-wise operation
+                element_op_(v_this_row, src_buf[Number<src_offset>{}]);
+
+                // apply intra-row permute.
+                if constexpr(IntraRowSwizzlePerm)
+                {
+                    temp = __builtin_amdgcn_permlane16(
+                        temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
+                    v_this_row = type_convert_sp<SrcData>(temp);
+                }
+
+                // apply type convert
+                dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
+            });
+        });
+    }
+    ElementwiseOperation element_op_{};
+};
+
 } // namespace ck
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -260,7 +260,7 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
 // gfx12
 /********************************WAVE32 MODE***********************************************/

-#if defined(__gfx1200__)
+#if defined(__gfx1200__) || defined(__gfx1201__)
 #define __gfx12__
 #endif