Unverified commit 3835318c, authored by zjing14 and committed by GitHub

xdlops_v4r4_fwd fp32/fp16 (#34)



* create files for xdlops

* working on blockwise_gemm_xdlops

* add KReduction

* add m/n repeats

* add 2x2 pipeline

* added 128x128 wavegemm

* use StaticBuffer of vector_type

* break vector type to blk_size

* add kpack into xldops_gemm and blockwise_gemm

* A-broadcast only

* add fp32 mfma instructions

* adding fp16 mfma

* pack half4_t

* rename kperwave to kpack

* add 32x32x8 fp16

* add fp16 mfma

* clean code

* clean code

* V4r4 xdlops kpack (#35)

* add kpack with incorrect results

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* add 1x1 kernel

* add gridwise_gemm_v2 - single_buffer

* enabled dwordx4 for fp16
Co-authored-by: Chao Liu <chao.liu2@amd.com>

* refactor fwd-v4r4-xdlops

* add v4r4-nhwc-xdlops

* improve perf of nhwc and nchw by tuning parameters, and change scheduling in the gridwise-gemm loop

* tweak scheduling in gridwise gemm

* add v4r3 with a single output copy

* init commit: output with slice win

* adding sliceWin

* add multiple repeats pattern

* start adding bwd-v4r1-xdlops

* use tuple as SrcBuffer

* adding bwd-data v4r1 nhwc xdlops

* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()

* fix bug in host bwd-data conv

* initial implementation of bwd-data v4r1 nhwc xdlops

* add launch bound flags

* enable launch bound

* add m/nrepeat=4

* tweak bwd-data v4r1 nhwc xdlops

* added bwd-data v4r1 nhwc xdlops with output A and weight B

* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: Chao Liu <chao.liu2@amd.com>
parent 1685048a
@@ -101,6 +101,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};

+    // GM0 and GN0 need to be known at compile-time
     static constexpr auto GM0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I0);
     static constexpr auto GN0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I2);
@@ -140,7 +141,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
     {
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value &&
                           is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value,
-                      "wrong!");
+                      "wrong! GM0 and GN0 need to be known at compile-time");

         const auto GM1 = a_gk_gm0_gm1_grid_desc.GetLength(I2);
         const auto GN1 = b_gk_gn0_gn1_grid_desc.GetLength(I2);
......
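For context, is_known_at_compile_time is what makes the static_assert above work: it is true for integral-constant-like wrappers such as Number<> and false for plain runtime integers. A minimal standalone sketch of that distinction, with std::integral_constant standing in for the repo's Number<> and the trait re-implemented here purely for illustration:

#include <type_traits>

// stand-ins with assumed semantics, not the repo's definitions
template <int N>
using Number = std::integral_constant<int, N>;

template <typename T>
struct is_known_at_compile_time : std::false_type {};

template <int N>
struct is_known_at_compile_time<std::integral_constant<int, N>> : std::true_type {};

int main()
{
    constexpr auto GM0 = Number<4>{}; // value carried in the type: compile-time known
    const int GM1 = 8;                // plain integer: runtime-only

    static_assert(is_known_at_compile_time<std::remove_cv_t<decltype(GM0)>>::value,
                  "GM0 is compile-time known");
    static_assert(!is_known_at_compile_time<std::remove_cv_t<decltype(GM1)>>::value,
                  "GM1 is not");
}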
@@ -101,9 +101,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");

-        static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
-                              remove_cv_t<remove_reference_t<SrcData>>>::value,
-                      "wrong! SrcBuffer data type is wrong");
+        // static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
+        //               remove_cv_t<remove_reference_t<SrcData>>>::value,
+        //"wrong! SrcBuffer data type is wrong");

         // SrcDesc and src_slice_origin_idx are known at compile-time
         constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
@@ -1407,7 +1407,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
         constexpr auto data_to_origin_disp_idx =
             ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
 #endif
-
         // src coordinate
         constexpr auto src_ref_to_data_disp_idx =
             src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
......
@@ -268,6 +268,7 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
        }
        else if constexpr(N == 8)
        {
+#if 0
            vector_type<half_t, 8> tmp;

            tmp.AsType<half4_t>()(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
@@ -280,6 +281,12 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
                0);

            return tmp.AsType<half8_t>()(Number<0>{});
+#else
+            float4_t tmp = __llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<half8_t>(tmp);
+#endif
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
else if constexpr(is_same<T, int32_t>::value)
......
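The #else path above replaces two fp16x4 buffer loads with a single dwordx4 (16-byte) fp32x4 load whose bits are reinterpreted as eight packed fp16 values. A minimal sketch of the reinterpretation idea, using the clang ext_vector_type extension and __builtin_bit_cast in place of the repo's as_type<> (typedef names assumed to mirror float4_t/half8_t):

using float4_t = float __attribute__((ext_vector_type(4)));    // 16 bytes
using half8_t  = _Float16 __attribute__((ext_vector_type(8))); // 16 bytes

half8_t reinterpret_as_half8(float4_t raw)
{
    // same size, trivially copyable: only the bit pattern matters, so one
    // fp32x4 load can carry eight fp16 values without any conversion
    return __builtin_bit_cast(half8_t, raw);
}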
@@ -18,7 +18,7 @@
 #define CK_AMD_GPU_GFX906 1
 #elif 1
 #define CK_AMD_GPU_GFX908 1
-#elif 1
+#elif 0
 #define CK_AMD_GPU_GFX1030 1
 #endif
@@ -28,7 +28,7 @@
 #endif

 // launch bounds
-#define CK_USE_LAUNCH_BOUNDS 0
+#define CK_USE_LAUNCH_BOUNDS 1

 #ifdef CK_USE_LAUNCH_BOUNDS
 #define CK_MAX_THREAD_PER_BLOCK 256
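Flipping CK_USE_LAUNCH_BOUNDS on lets kernels promise a maximum block size, so the compiler can budget registers per thread less conservatively. A minimal sketch of how such a macro is typically attached to a HIP kernel; the CK_KERNEL_BOUNDS wrapper name is illustrative, not necessarily the repo's:

#include <hip/hip_runtime.h>

#if CK_USE_LAUNCH_BOUNDS
#define CK_KERNEL_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK)
#else
#define CK_KERNEL_BOUNDS
#endif

// the attribute guarantees blockDim.x * blockDim.y * blockDim.z <= 256,
// which the compiler may exploit during register allocation
__global__ void CK_KERNEL_BOUNDS scale_kernel(float* p, float s, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        p[i] *= s;
}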
@@ -116,7 +116,7 @@
 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1

 // merge transformation use magic number division
-#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
+#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1

 // hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
......
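Turning on CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION means the merge transform's index math replaces integer division with a precomputed multiply-and-shift, which is much cheaper than a hardware divide in GPU address arithmetic. A standalone sketch of the trick for the fixed divisor 3 (not the repo's general implementation); the magic constant 0xAAAAAAAB is ceil(2^33 / 3) and is exact for every 32-bit unsigned n:

#include <cassert>
#include <cstdint>

// n / 3 as a multiply-and-shift: no hardware divide needed
inline std::uint32_t div3_magic(std::uint32_t n)
{
    return static_cast<std::uint32_t>(
        (static_cast<std::uint64_t>(n) * 0xAAAAAAABull) >> 33);
}

int main()
{
    for(std::uint32_t n = 0; n < 1000000; ++n)
    {
        assert(div3_magic(n) == n / 3);
    }
}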
@@ -174,8 +174,15 @@ __host__ __device__ constexpr auto container_reduce(const Container& x,
 {
     static_assert((IEnd - IBegin) % IStep == 0, "wrong!");

-    return container_reduce_impl(
-        x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
+    if constexpr(IEnd > IBegin)
+    {
+        return container_reduce_impl(
+            x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
+    }
+    else
+    {
+        return init;
+    }
 }
 #endif
......
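The if constexpr guard above makes an empty range (IEnd == IBegin) legal: it returns init directly instead of instantiating container_reduce_impl on zero elements. A standalone sketch of the same pattern over std::tuple (illustrative names, not the repo's):

#include <tuple>

// fold elements [IBegin, IEnd) left-to-right; an empty range yields init
template <int IBegin, int IEnd, typename Tuple, typename Reduce, typename Init>
constexpr auto tuple_reduce(const Tuple& x, Reduce reduce, Init init)
{
    if constexpr(IEnd > IBegin)
    {
        return tuple_reduce<IBegin + 1, IEnd>(x, reduce, reduce(init, std::get<IBegin>(x)));
    }
    else
    {
        return init; // base case: nothing left to reduce
    }
}

static_assert(tuple_reduce<0, 3>(std::tuple{2, 3, 4},
                                 [](auto a, auto b) { return a * b; }, 1) == 24);
static_assert(tuple_reduce<0, 0>(std::tuple{2, 3, 4},
                                 [](auto a, auto b) { return a * b; }, 1) == 1);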
@@ -153,6 +153,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
         return *this;
     }

+    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
+
 };

 template <typename... Xs>
......
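Giving Tuple an IsStaticBuffer() predicate lets it pass the "wrong! SrcBuffer need to be StaticBuffer" check in ThreadwiseDynamicTensorSliceTransfer_v1r3 (matching the "use tuple as SrcBuffer" commit above); with a tuple there is no single SrcBuffer::type, which is presumably why the per-element type assert was commented out. A minimal sketch of such a predicate-gated consumer, with toy buffer types in place of the repo's:

// toy buffers exposing the same compile-time predicate shape
struct StaticBufferLike  { static constexpr bool IsStaticBuffer() { return true; } };
struct DynamicBufferLike { static constexpr bool IsStaticBuffer() { return false; } };

template <typename SrcBuffer>
void transfer(const SrcBuffer&)
{
    static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
}

int main()
{
    transfer(StaticBufferLike{});     // compiles: predicate is true
    // transfer(DynamicBufferLike{}); // would trip the static_assert
}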
@@ -19,7 +19,22 @@ int main(int argc, char* argv[])
 {
     using namespace launcher;

-#if 0
+#if 1
+    // 1x1 filter, 1x128 image
+    constexpr index_t N = 1;
+    constexpr index_t C = 256;
+    constexpr index_t HI = 1;
+    constexpr index_t WI = 128;
+    constexpr index_t K = 16;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+
+    using ConvStrides = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
     constexpr index_t N = 64;
     constexpr index_t C = 256;
     constexpr index_t HI = 56;
@@ -93,7 +108,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;

-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;
@@ -153,7 +168,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<2, 2>;
     using RightPads = Sequence<2, 2>;

-#elif 1
+#elif 0
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;
@@ -245,7 +260,7 @@ int main(int argc, char* argv[])
     device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
 #elif 0
     device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
-#elif 1
+#elif 0
     device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
 #elif 1
     device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
......
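To read these configs, the output spatial size follows the standard convolution formula HO = (HI + pad_l + pad_r - (dilation * (Y - 1) + 1)) / stride + 1, so the enabled 1x1-filter case above (HI=1, WI=128, zero pad, unit stride and dilation) yields HO=1, WO=128. A small standalone check (not repo code):

// standard convolution output-length formula
constexpr int conv_out_len(int in, int pad_l, int pad_r, int filter, int dilation, int stride)
{
    const int effective_filter = dilation * (filter - 1) + 1;
    return (in + pad_l + pad_r - effective_filter) / stride + 1;
}

// the enabled HI=1, WI=128, 1x1-filter config above
static_assert(conv_out_len(1, 0, 0, 1, 1, 1) == 1, "HO");
static_assert(conv_out_len(128, 0, 0, 1, 1, 1) == 128, "WO");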