added nk0hwk1 output

c0b9d8c2 · Jing Zhang · 4feb5477 · c0b9d8c2 · c0b9d8c2
Commit c0b9d8c2 authored Mar 26, 2021 by Jing Zhang
2 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -200,7 +200,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
-            1,
+            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_k_m_global_iterator_hacks),
            decltype(b_k_n_global_iterator_hacks),

--- a/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
@@ -231,10 +231,11 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
                                         index_t dst_thread_addr_offset,
                                         index_t dst_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+    static_assert(
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
+        (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4)),
+            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
-                  "wrong! not implemented");
+            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        "wrong! not implemented");
    if constexpr(is_same<T, float>::value)
    {
@@ -316,6 +317,22 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
                                               dst_wave_addr_offset,
                                               0);
        }
+        else if constexpr(N == 8)
+        {
+            __llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
+                                                 dst_wave_buffer_resource,
+                                                 dst_thread_addr_offset,
+                                                 dst_wave_addr_offset,
+                                                 0);
+        }
+        else if constexpr(N == 16)
+        {
+            __llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
+                                                 dst_wave_buffer_resource,
+                                                 dst_thread_addr_offset,
+                                                 dst_wave_addr_offset,
+                                                 0);
+        }
    }
 }