gaoqiong / composable_kernel / Commits

Commit b3108646, authored Jun 12, 2019 by Jing Zhang
add asm
parent 5be6a5f9

Showing 3 changed files with 257 additions and 11 deletions:

src/include/amd_inline_asm.hip.hpp  (+122, -11)
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp  (+3, -0)
src/include/threadwise_generic_tensor_slice_op.hip.hpp  (+132, -0)
src/include/amd_inline_asm.hip.hpp

@@ -8,7 +8,7 @@
 #define NO_GLB_READ 0

 // cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p) [[hc]];
+extern "C" __attribute__((address_space(3))) void* __to_local(const void* p) [[hc]];

 __device__ void vmcnt(index_t cnt)
 {
...
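On AMDGPU, address_space(3) is the LDS (local data share) address space, and __to_local casts a generic pointer into it so it can serve as the address operand of ds_* instructions; this hunk loosens the parameter to const void* so the rewritten DS-write wrappers below can accept const staging pointers. A minimal sketch of how such a cast gets consumed (ds_write_b32 is defined later in this file; the function and buffer names here are illustrative only, not part of the commit):

    // Illustrative only: route an LDS pointer through the DS-write wrapper,
    // which internally applies __to_local before the inline asm.
    __device__ void example_write_lds(const float* r)
    {
        __shared__ float lds_buf[256];
        ds_write_b32(&lds_buf[0], r); // emits ds_write_b32 with __to_local(lds) as the address
    }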
@@ -721,18 +721,54 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
 #endif
 }

-__device__ void global_load(vector_type<float, 4>::MemoryType& r,
-                            const vector_type<float, 4>::MemoryType* ptr,
-                            index_t offset = 0)
+__device__ void global_loadx4(void* r, const void* ptr, index_t offset = 0)
 {
 #if !NO_GLB_READ
     if(offset == 0)
     {
         //*(vector_type<float, 4>::MemoryType*)(r) = *(vector_type<float, 4>::MemoryType*)(ptr);
         asm volatile("\n \
         global_load_dwordx4 %0, %1, off \n \
         "
-                     : "=v"(r)
-                     : "v"(ptr));
+                     : "=v"(*(vector_type<float, 4>::MemoryType*)(r))
+                     : "r"(ptr));
     }
     else
     {
         assert(false);
     }
 #endif
 }

+__device__ void global_loadx2(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+        global_load_dwordx2 %0, %1, off \n \
+        "
+                     : "=v"(*(vector_type<float, 2>::MemoryType*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_loadx1(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(r) = *(float*)(ptr);
+        asm volatile("\n \
+        global_load_dword %0, %1, off \n \
+        "
+                     : "=v"(*(float*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
...
@@ -741,17 +777,92 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
 #endif
 }

-__device__ void ds_write_b128(const vector_type<float, 4>::MemoryType& r,
-                              void* lds,
-                              index_t offset = 0)
+__device__ void global_storex4(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(vector_type<float, 4>::MemoryType*)(ptr) = *(vector_type<float, 4>::MemoryType*)(r);
+        asm volatile("\n \
+        global_store_dwordx4 %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_storex2(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+        global_store_dwordx2 %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 2>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void global_storex1(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(ptr) = *(float*)(r);
+        asm volatile("\n \
+        global_store_dword %0, %1, off \n \
+        "
+                     :
+                     : "r"(ptr), "v"(*(float*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+
+__device__ void ds_write_b128(const void* lds, const void* r, index_t offset = 0)
+{
+#if !NO_DS_WRITE
+    if(offset == 0)
+    {
+        //*(vector_type<float, 4>::MemoryType*)(lds) = *(vector_type<float, 4>::MemoryType*)(r);
+        asm volatile("\n \
+        ds_write_b128 %0, %1 \n \
+        "
+                     :
+                     : "v"(__to_local(lds)), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}

 __device__ void ds_write_b32(const void* lds, const void* r, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
     {
         //*(float*)(lds) = *(float*)(r);
         asm volatile("\n \
         ds_write_b32 %0, %1 \n \
         "
                      :
-                     : "v"(__to_local(lds)), "v"(r));
+                     : "v"(__to_local(lds)), "v"(*(float*)(r)));
     }
     else
     {
...
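Taken together, these wrappers give hand-written global_load_dword[x2|x4] / global_store_dword[x2|x4] / ds_write_b32|b128 building blocks for the global-to-LDS staging path used by the double-buffered kernel below. A minimal usage sketch, assuming a 16-byte-aligned source pointer (the function and buffer names here are illustrative, not part of the commit):

    // Illustrative only: stage one float4 from global memory into LDS
    // via registers, using the wrappers defined above.
    __device__ void stage_float4(const float* p_global, void* p_lds)
    {
        vector_type<float, 4>::MemoryType tmp;

        global_loadx4(&tmp, p_global); // global_load_dwordx4 into four VGPRs
        vmcnt(0);                      // wait for the outstanding global load
        ds_write_b128(p_lds, &tmp);    // ds_write_b128 from those VGPRs
    }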
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp

@@ -460,6 +460,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard);

+        // vmcnt(0);
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double);
...

@@ -504,6 +505,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
             // LDS double buffer: GEMM on current data
             run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);

+            // vmcnt(0);
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
...

@@ -535,6 +537,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         // LDS double buffer: GEMM on current data
         run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);

+        // vmcnt(0);
         // LDS double buffer: store next data to LDS
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space);
...
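Each hunk adds the same commented-out marker at the point in the LDS double-buffer pipeline where outstanding global loads must be complete before the register clipboard is flushed to LDS. The vmcnt(index_t) helper in amd_inline_asm.hip.hpp presumably wraps s_waitcnt vmcnt(<cnt>); enabled, vmcnt(0) would block until every issued global load has landed in its VGPRs. A condensed sketch of the loop shape these hunks annotate (p_in_global is a hypothetical name; the other names come from this file):

    // One iteration of the double-buffer main loop, condensed:
    blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard); // global -> VGPRs
    run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);                // GEMM overlaps the loads
    // vmcnt(0);                                                                      // would fence the loads here
    blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); // VGPRs -> LDS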
src/include/threadwise_generic_tensor_slice_op.hip.hpp

@@ -97,3 +97,135 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
     });
 #endif
 }
+
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t DataPerAccess,
+          index_t OpType>
+__device__ void threadwise_generic_tensor_slice_copy_v1_asm(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder,
+    Number<DataPerAccess>,
+    Number<OpType>)
+{
+    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                      nDim == DstDesc::GetNumOfDimension() &&
+                      nDim == SliceLengths::GetSize() &&
+                      nDim == DimAccessOrder::GetSize(),
+                  "wrong! # of dimensions not the same");
+
+    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+#if 0
+    // doesn't compile, because merged-tensor reordering is not implemented
+    // TODO: implement tensor desc ops for merged-tensor
+    constexpr auto src_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    constexpr auto dst_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    // check src/dst stride on the lowest access dimension
+    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+                  "vectorized read/write");
+#endif
+
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
+
+    // check slice length on the lowest access dimension
+    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
+                  "wrong! slice length on the lowest access dimension should be evenly divided by "
+                  "DataPerAccess");
+
+    constexpr index_t num_access_on_lowest_access_dimension =
+        slice_lengths_in_access_order.Back() / DataPerAccess;
+
+    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
+        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
+
+    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
+
+#if 1
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order      = access_multi_id;
+        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        static_assert(DataPerAccess == 1 || DataPerAccess == 4, "unsupported DataPerAccess");
+        static_assert(OpType == 1, "unsupported OpType");
+
+        if(DataPerAccess == 4)
+        {
+            if(OpType == 1)
+            {
+                global_loadx4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b128(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+
+        if(DataPerAccess == 1)
+        {
+            if(OpType == 1)
+            {
+                global_loadx1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b32(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+
+        //*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+        //    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#else
+    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
+
+        constexpr auto data_multi_id_in_access_order =
+            access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
+
+        constexpr auto data_multi_id = reorder_array_given_old2new(
+            sequence2array(data_multi_id_in_access_order), DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#endif
+}
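threadwise_generic_tensor_slice_copy_v1_asm mirrors threadwise_generic_tensor_slice_copy_v1 above but routes every access through the new asm wrappers, dispatching on DataPerAccess (1 or 4) and OpType; note the static_assert currently admits only OpType == 1 (global load), so the store and ds_write branches are dead code at this point. A minimal call sketch, assuming 2-D src/dst tensor descriptors as used elsewhere in the repo (the descriptor types, slice shape, and zero begin-index construction are hypothetical placeholders):

    // Hypothetical instantiation: copy a 4x16 float slice with float4
    // accesses, lowest dimension accessed last, OpType 1 = global load.
    threadwise_generic_tensor_slice_copy_v1_asm(SrcDesc{},
                                                p_src,
                                                Array<index_t, 2>{{0, 0}},
                                                DstDesc{},
                                                p_dst,
                                                Array<index_t, 2>{{0, 0}},
                                                Sequence<4, 16>{},
                                                Sequence<0, 1>{},
                                                Number<4>{},
                                                Number<1>{});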