Commit 9d5d6afa authored by Chao Liu

updating v5r1

parent dcee43fe
@@ -98,9 +98,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcSliceOriginIdx>>>::value,
             "wrong! SrcSliceOrigin need to known at compile-time");

+#if 0 // debug
+        // TODO: turn this on, once v5r1 is updated to use StaticBuffer for holding C data
+        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
+
         static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
                               remove_cv_t<remove_reference_t<SrcData>>>::value,
                       "wrong! SrcBuffer data type is wrong");
+#endif

         // SrcDesc and src_slice_origin_idx are known at compile-time
         constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
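The new assert is parked behind `#if 0` because v5r1 still passes C data in a raw `vector_type` rather than a `StaticBuffer`. A minimal sketch of the predicate it expects, assuming a tag-style interface on the buffer type (the repo's actual `StaticBuffer` may spell this differently):

```cpp
// Hypothetical sketch: a buffer type advertising that its storage is static
// (compile-time sized, register-resident), so transfers can assert on it.
template <typename T, int N>
struct StaticBufferSketch
{
    using type = T; // element type checked by the second static_assert

    T data_[N];

    __device__ static constexpr bool IsStaticBuffer() { return true; }
};
```

With that in place, `static_assert(SrcBuffer::IsStaticBuffer(), ...)` rejects dynamically addressed buffers at compile time instead of failing at run time.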
@@ -758,6 +763,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         static_assert(DstAddressSpace == AddressSpace::Global or
                           DstAddressSpace == AddressSpace::Lds,
                       "wrong!");
+
+        // TODO: fix this
+        static_assert(is_same<SrcData, DstData>::value,
+                      "wrong! current implementation assume SrcData and DstData are same type");
     }

     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -859,11 +868,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 return src_data_idx;
             }();

-            // copy data
-            typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
-
-            using src_vector_t =
-                typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+            // copy data from src_buf to src_tmp_vector
+            vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
+
+            using src_vector_t = typename decltype(src_tmp_vector)::type;

             const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                 src_desc, src_slice_origin_coord_);
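`vector_type_maker_t` shortens the old `typename vector_type_maker<...>::type` spelling, and `src_vector_t` is now derived from the declared variable so the element type and count are written once. Presumably the alias is just the following (an assumption inferred from the replaced spelling on the left side):

```cpp
// Assumed definition of the alias template, mirroring the replaced code:
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;

// Usage as in the new code: the raw HW vector type is recovered from the
// variable instead of repeating the template arguments.
vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
```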
@@ -871,14 +879,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             if constexpr(SrcAddressSpace == AddressSpace::Global)
             {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
                         p_src,
                         src_slice_origin_coord_.GetOffset(),
                         is_src_valid,
                         src_desc.GetElementSpaceSize());
 #else
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                        &p_src[src_slice_origin_coord_.GetOffset()])
                                  : src_vector_t{0};
@@ -886,18 +894,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             }
             else
             {
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                        &p_src[src_slice_origin_coord_.GetOffset()])
                                  : src_vector_t{0};
             }

+            // copy data from src_tmp_vector to buffer_
             static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                 constexpr index_t buffer_offset =
                     buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);

-                buffer_.template AsType<SrcData>()(Number<buffer_offset>{}) =
-                    src_vector.template AsType<SrcData>()[i];
+                buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
             });

             constexpr auto move_on_dim = [&]() constexpr
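The `static_for` above is a compile-time loop: each iteration receives its index as a distinct compile-time constant, which is what lets `buffer_offset` be `constexpr` and the buffer access be statically indexed. A minimal, self-contained illustration of the idiom (the repo's `static_for<Begin, End, Inc>` is more general):

```cpp
#include <utility>

// Minimal static_for: invokes f with integral_constant<int, 0> ... <int, N-1>,
// so the body sees a compile-time index on every iteration.
template <typename F, int... Is>
constexpr void static_for_impl(F&& f, std::integer_sequence<int, Is...>)
{
    (f(std::integral_constant<int, Is>{}), ...);
}

template <int N, typename F>
constexpr void static_for(F&& f)
{
    static_for_impl(std::forward<F>(f), std::make_integer_sequence<int, N>{});
}

// usage: fully unrolled copy with compile-time offsets
// static_for<4>([&](auto i) { dst[decltype(i)::value] = src[decltype(i)::value]; });
```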
@@ -1048,21 +1056,21 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                       DstInMemOp == InMemoryDataOperation::Set,
                       "wrong! hardcoded for ds_write");

-        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
+        vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;

+        // copy data from buffer_ to dst_tmp_vector
         static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

-            dst_vector.template AsType<DstData>()(i) =
-                buffer_.template AsType<DstData>()[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
         });

-        using DstVectorType =
-            typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
+        using dst_vector_t = typename decltype(dst_tmp_vector)::type;

-        *reinterpret_cast<DstVectorType*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
-            dst_vector.template AsType<DstVectorType>()[Number<0>{}];
+        // copy data from dst_tmp_vector to dst_buf
+        *reinterpret_cast<dst_vector_t*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
+            dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}];

         constexpr auto move_on_dim = [&]() constexpr
         {
@@ -1319,7 +1327,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

-    typename vector_type_maker<SrcData, buffer_size_>::type buffer_;
+    StaticBuffer<SrcData, buffer_size_> buffer_;

     SrcCoord src_slice_origin_coord_;
     DstCoord dst_slice_origin_coord_;
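The thread-local staging buffer switches from raw `vector_type` storage to a `StaticBuffer`, which the earlier hunks index with `Number<>` constants (`buffer_(Number<...>{}) = ...` for writes, `buffer_[Number<...>{}]` for reads) instead of going through `AsType<SrcData>()`. A minimal sketch of such a statically indexed buffer, assuming this interface (the repo's version differs in detail):

```cpp
#include <cstdint>
#include <type_traits>

using index_t = std::int32_t;
template <index_t I>
using Number = std::integral_constant<index_t, I>; // stand-in for ck::Number

// Sketch: register-resident array with compile-time indexed access, so each
// element access resolves to a fixed register rather than an indexed load.
template <typename T, index_t N>
struct StaticBuffer
{
    T data_[N];

    template <index_t I>
    constexpr T& operator()(Number<I>) // write access: buf(Number<i>{}) = x;
    {
        static_assert(I < N, "index out of range");
        return data_[I];
    }

    template <index_t I>
    constexpr const T& operator[](Number<I>) const // read access: x = buf[Number<i>{}];
    {
        static_assert(I < N, "index out of range");
        return data_[I];
    }
};
```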
...
@@ -61,6 +61,7 @@ struct ThreadwiseGemm_km_kn_mn_v3
         static_for<0, E, 1>{}([&](auto e) {
             static_for<0, K, 1>{}([&](auto k) {
+#if 0
                 constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));

                 if constexpr(H == 2 && W == 2)
                 {
@@ -123,6 +124,22 @@ struct ThreadwiseGemm_km_kn_mn_v3
                     });
                 });
             }
+#else
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
+
+                static_for<0, H, 1>{}([&](auto h) {
+                    static_for<0, W, 1>{}([&](auto w) {
+                        constexpr index_t b_offset =
+                            BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
+                        constexpr index_t c_offset =
+                            CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
+
+                        amd_assembly_inner_product(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset>{}],
+                                                   p_c[Number<c_offset>{}]);
+                    });
+                });
+#endif
             });
         });
     }
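The new `#else` branch replaces the hand-unrolled `H == 2 && W == 2` special case with a generic loop nest over `H` and `W`, delegating each multiply-accumulate to `amd_assembly_inner_product`. Semantically, each call is `c += a * b` (for vector int8 operands, a dot product accumulated into an `int32_t`); a scalar reference version, for illustration only:

```cpp
// Reference semantics of the accumulate call (not the actual inline-asm path):
template <typename T, typename Acc>
__device__ void inner_product_ref(const T& a, const T& b, Acc& c)
{
    c += static_cast<Acc>(a) * static_cast<Acc>(b);
}
```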
...
@@ -36,6 +36,44 @@ __device__ void amd_assembly_inner_product(const int8x4_t& a, const int8x4_t& b,
 #endif
 }

+__device__ void amd_assembly_inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
+                               c);
+}
+
+__device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
+                               c);
+}
+
 #if 0
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
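The two new overloads extend the int8 dot-product family by splitting `int8x8_t` and `int8x16_t` operands into `int8x4_t` chunks (through the `vector_type` `AsType` view) and chaining the existing `int8x4_t` overload, so all partial products accumulate into the same `int32_t`. On hardware with dot-product instructions the base case presumably maps to something like `v_dot4_i32_i8`; a portable reference for what one 4-wide chunk computes:

```cpp
#include <cstdint>

// Reference semantics of the int8x4 base case: a 4-element dot product
// accumulated into a 32-bit integer (illustration, not the inline-asm version).
inline void inner_product_int8x4_ref(const std::int8_t a[4], const std::int8_t b[4],
                                     std::int32_t& c)
{
    for(int i = 0; i < 4; ++i)
        c += std::int32_t(a[i]) * std::int32_t(b[i]);
}
```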
...
@@ -14,11 +14,11 @@
 #define CK_DEVICE_BACKEND_AMD 1

 // GPU ID
-#if 1
+#if 0
 #define CK_AMD_GPU_GFX906 1
 #elif 0
 #define CK_AMD_GPU_GFX908 1
-#elif 0
+#elif 1
 #define CK_AMD_GPU_GFX1030 1
 #endif
...
@@ -53,7 +53,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     constexpr auto C0 = C / Number<InWeiVectorSize>{};
     constexpr auto C1 = Number<InWeiVectorSize>{};

-#if 1
+#if 0
     // run-time variables
     constexpr auto in_n_hi_wi_c0_desc =
         make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0));
@@ -112,7 +112,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
     out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

-#if 0
+#if 1
     // cdata = 16, BlockSize = 64, 16x64x4
     constexpr index_t BlockSize = 64;
...
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t HI = 1080;
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", to_multi_index(ConvStrides{}));
     print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 1
+#if 0
     using in_data_t = float;
     constexpr index_t in_vector_size = 1;
     using acc_data_t = float;
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
                                                                          in_vector_size,
                                                                          acc_data_t,
...