Commit 9d5d6afa authored by Chao Liu

updating v5r1

parent dcee43fe
@@ -98,9 +98,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcSliceOriginIdx>>>::value,
             "wrong! SrcSliceOrigin need to known at compile-time");

+#if 0 // debug
+        // TODO: turn this on, once v5r1 is updated to use StaticBuffer for holding C data
+        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
+
         static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
                               remove_cv_t<remove_reference_t<SrcData>>>::value,
                       "wrong! SrcBuffer data type is wrong");
+#endif

         // SrcDesc and src_slice_origin_idx are known at compile-time
         constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
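The new assert is parked behind `#if 0` because v5r1 still passes C data in a raw `vector_type` rather than a `StaticBuffer`. A minimal sketch of the predicate it expects, assuming a tag-style interface on the buffer type (the repo's actual `StaticBuffer` may spell this differently):

```cpp
// Hypothetical sketch: a buffer type advertising that its storage is static
// (compile-time sized, register-resident), so transfers can assert on it.
template <typename T, int N>
struct StaticBufferSketch
{
    using type = T; // element type checked by the second static_assert

    T data_[N];

    __device__ static constexpr bool IsStaticBuffer() { return true; }
};
```

With that in place, `static_assert(SrcBuffer::IsStaticBuffer(), ...)` rejects dynamically addressed buffers at compile time instead of failing at run time.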
@@ -758,6 +763,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         static_assert(DstAddressSpace == AddressSpace::Global or
                           DstAddressSpace == AddressSpace::Lds,
                       "wrong!");
+
+        // TODO: fix this
+        static_assert(is_same<SrcData, DstData>::value,
+                      "wrong! current implementation assume SrcData and DstData are same type");
     }

     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -859,11 +868,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 return src_data_idx;
             }();

-            // copy data
-            typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
-
-            using src_vector_t =
-                typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+            // copy data from src_buf to src_tmp_vector
+            vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
+
+            using src_vector_t = typename decltype(src_tmp_vector)::type;

             const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                 src_desc, src_slice_origin_coord_);
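`vector_type_maker_t` shortens the old `typename vector_type_maker<...>::type` spelling, and `src_vector_t` is now derived from the declared variable so the element type and count are written once. Presumably the alias is just the following (an assumption inferred from the replaced spelling on the left side):

```cpp
// Assumed definition of the alias template, mirroring the replaced code:
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;

// Usage as in the new code: the raw HW vector type is recovered from the
// variable instead of repeating the template arguments.
vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
```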
@@ -871,14 +879,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             if constexpr(SrcAddressSpace == AddressSpace::Global)
             {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
                         p_src,
                         src_slice_origin_coord_.GetOffset(),
                         is_src_valid,
                         src_desc.GetElementSpaceSize());
 #else
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                        &p_src[src_slice_origin_coord_.GetOffset()])
                                  : src_vector_t{0};
@@ -886,18 +894,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             }
             else
             {
-                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                     is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                        &p_src[src_slice_origin_coord_.GetOffset()])
                                  : src_vector_t{0};
             }

+            // copy data from src_tmp_vector to buffer_
             static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                 constexpr index_t buffer_offset =
                     buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);

-                buffer_.template AsType<SrcData>()(Number<buffer_offset>{}) =
-                    src_vector.template AsType<SrcData>()[i];
+                buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
             });

             constexpr auto move_on_dim = [&]() constexpr
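The `static_for` above is a compile-time loop: each iteration receives its index as a distinct compile-time constant, which is what lets `buffer_offset` be `constexpr` and the buffer access be statically indexed. A minimal, self-contained illustration of the idiom (the repo's `static_for<Begin, End, Inc>` is more general):

```cpp
#include <utility>

// Minimal static_for: invokes f with integral_constant<int, 0> ... <int, N-1>,
// so the body sees a compile-time index on every iteration.
template <typename F, int... Is>
constexpr void static_for_impl(F&& f, std::integer_sequence<int, Is...>)
{
    (f(std::integral_constant<int, Is>{}), ...);
}

template <int N, typename F>
constexpr void static_for(F&& f)
{
    static_for_impl(std::forward<F>(f), std::make_integer_sequence<int, N>{});
}

// usage: fully unrolled copy with compile-time offsets
// static_for<4>([&](auto i) { dst[decltype(i)::value] = src[decltype(i)::value]; });
```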
@@ -1048,21 +1056,21 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                       DstInMemOp == InMemoryDataOperation::Set,
                       "wrong! hardcoded for ds_write");

-        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
+        vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;

+        // copy data from buffer_ to dst_tmp_vector
         static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

-            dst_vector.template AsType<DstData>()(i) =
-                buffer_.template AsType<DstData>()[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
         });

-        using DstVectorType =
-            typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
+        using dst_vector_t = typename decltype(dst_tmp_vector)::type;

-        *reinterpret_cast<DstVectorType*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
-            dst_vector.template AsType<DstVectorType>()[Number<0>{}];
+        // copy data from dst_tmp_vector to dst_buf
+        *reinterpret_cast<dst_vector_t*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
+            dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}];

         constexpr auto move_on_dim = [&]() constexpr
         {
@@ -1319,7 +1327,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

-    typename vector_type_maker<SrcData, buffer_size_>::type buffer_;
+    StaticBuffer<SrcData, buffer_size_> buffer_;

     SrcCoord src_slice_origin_coord_;
     DstCoord dst_slice_origin_coord_;
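The thread-local staging buffer switches from raw `vector_type` storage to a `StaticBuffer`, which the earlier hunks index with `Number<>` constants (`buffer_(Number<...>{}) = ...` for writes, `buffer_[Number<...>{}]` for reads) instead of going through `AsType<SrcData>()`. A minimal sketch of such a statically indexed buffer, assuming this interface (the repo's version differs in detail):

```cpp
#include <cstdint>
#include <type_traits>

using index_t = std::int32_t;
template <index_t I>
using Number = std::integral_constant<index_t, I>; // stand-in for ck::Number

// Sketch: register-resident array with compile-time indexed access, so each
// element access resolves to a fixed register rather than an indexed load.
template <typename T, index_t N>
struct StaticBuffer
{
    T data_[N];

    template <index_t I>
    constexpr T& operator()(Number<I>) // write access: buf(Number<i>{}) = x;
    {
        static_assert(I < N, "index out of range");
        return data_[I];
    }

    template <index_t I>
    constexpr const T& operator[](Number<I>) const // read access: x = buf[Number<i>{}];
    {
        static_assert(I < N, "index out of range");
        return data_[I];
    }
};
```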
...
@@ -61,6 +61,7 @@ struct ThreadwiseGemm_km_kn_mn_v3
         static_for<0, E, 1>{}([&](auto e) {
             static_for<0, K, 1>{}([&](auto k) {
+#if 0
                 constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));

                 if constexpr(H == 2 && W == 2)
                 {
@@ -123,6 +124,22 @@ struct ThreadwiseGemm_km_kn_mn_v3
                     });
                 });
             }
+#else
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
+
+                static_for<0, H, 1>{}([&](auto h) {
+                    static_for<0, W, 1>{}([&](auto w) {
+                        constexpr index_t b_offset =
+                            BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
+                        constexpr index_t c_offset =
+                            CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
+
+                        amd_assembly_inner_product(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset>{}],
+                                                   p_c[Number<c_offset>{}]);
+                    });
+                });
+#endif
             });
         });
     }
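The new `#else` branch replaces the hand-unrolled `H == 2 && W == 2` special case with a generic loop nest over `H` and `W`, delegating each multiply-accumulate to `amd_assembly_inner_product`. Semantically, each call is `c += a * b` (for vector int8 operands, a dot product accumulated into an `int32_t`); a scalar reference version, for illustration only:

```cpp
// Reference semantics of the accumulate call (not the actual inline-asm path):
template <typename T, typename Acc>
__device__ void inner_product_ref(const T& a, const T& b, Acc& c)
{
    c += static_cast<Acc>(a) * static_cast<Acc>(b);
}
```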
...
@@ -36,6 +36,44 @@ __device__ void amd_assembly_inner_product(const int8x4_t& a, const int8x4_t& b,
 #endif
 }

+__device__ void amd_assembly_inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
+                               c);
+}
+
+__device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
+                               c);
+}
+
 #if 0
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
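The two new overloads extend the int8 dot-product family by splitting `int8x8_t` and `int8x16_t` operands into `int8x4_t` chunks (through the `vector_type` `AsType` view) and chaining the existing `int8x4_t` overload, so all partial products accumulate into the same `int32_t`. On hardware with dot-product instructions the base case presumably maps to something like `v_dot4_i32_i8`; a portable reference for what one 4-wide chunk computes:

```cpp
#include <cstdint>

// Reference semantics of the int8x4 base case: a 4-element dot product
// accumulated into a 32-bit integer (illustration, not the inline-asm version).
inline void inner_product_int8x4_ref(const std::int8_t a[4], const std::int8_t b[4],
                                     std::int32_t& c)
{
    for(int i = 0; i < 4; ++i)
        c += std::int32_t(a[i]) * std::int32_t(b[i]);
}
```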
...
@@ -14,11 +14,11 @@
 #define CK_DEVICE_BACKEND_AMD 1

 // GPU ID
-#if 1
+#if 0
 #define CK_AMD_GPU_GFX906 1
 #elif 0
 #define CK_AMD_GPU_GFX908 1
-#elif 0
+#elif 1
 #define CK_AMD_GPU_GFX1030 1
 #endif
...
@@ -53,7 +53,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     constexpr auto C0 = C / Number<InWeiVectorSize>{};
     constexpr auto C1 = Number<InWeiVectorSize>{};

-#if 1
+#if 0
     // run-time variables
     constexpr auto in_n_hi_wi_c0_desc =
         make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0));
@@ -112,7 +112,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
     out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

-#if 0
+#if 1
     // cdata = 16, BlockSize = 64, 16x64x4
     constexpr index_t BlockSize = 64;
...
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t HI = 1080;
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", to_multi_index(ConvStrides{}));
     print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 1
+#if 0
     using in_data_t = float;
     constexpr index_t in_vector_size = 1;
     using acc_data_t = float;
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
                                                                          in_vector_size,
                                                                          acc_data_t,
...