Commit c0b9d8c2 authored by Jing Zhang's avatar Jing Zhang
Browse files

added nk0hwk1 output

parent 4feb5477
...@@ -200,7 +200,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -200,7 +200,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
false, // don't move back src coordinate after threadwise copy, which will be fused with false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation // MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>, Sequence<0, 2, 3, 1>,
1, 0,
CThreadTransferDstScalarPerVector_W, CThreadTransferDstScalarPerVector_W,
decltype(a_k_m_global_iterator_hacks), decltype(a_k_m_global_iterator_hacks),
decltype(b_k_n_global_iterator_hacks), decltype(b_k_n_global_iterator_hacks),
......
...@@ -231,9 +231,10 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type ...@@ -231,9 +231,10 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
index_t dst_thread_addr_offset, index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset) index_t dst_wave_addr_offset)
{ {
static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) || static_assert(
(is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
(is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) || (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
(is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4)), (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
"wrong! not implemented"); "wrong! not implemented");
if constexpr(is_same<T, float>::value) if constexpr(is_same<T, float>::value)
...@@ -316,6 +317,22 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type ...@@ -316,6 +317,22 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
dst_wave_addr_offset, dst_wave_addr_offset,
0); 0);
} }
else if constexpr(N == 8)
{
__llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data,
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
0);
}
else if constexpr(N == 16)
{
__llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data,
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
0);
}
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment