Commit 9d5d6afa authored by Chao Liu

updating v5r1: use vector_type_maker_t temporaries and a StaticBuffer for the thread-local buffer in ThreadwiseDynamicTensorSliceTransfer_v3, add int8x8/int8x16 overloads of amd_assembly_inner_product, and switch the default GPU target to gfx1030

parent dcee43fe
@@ -98,9 +98,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcSliceOriginIdx>>>::value,
         "wrong! SrcSliceOrigin needs to be known at compile-time");
 
+#if 0 // debug
+        // TODO: turn this on, once v5r1 is updated to use StaticBuffer for holding C data
+        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer needs to be a StaticBuffer");
+
+        static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
+                              remove_cv_t<remove_reference_t<SrcData>>>::value,
+                      "wrong! SrcBuffer data type is wrong");
+#endif
 
         // SrcDesc and src_slice_origin_idx are known at compile-time
         constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
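For context on the compile-time guards above: `is_known_at_compile_time` accepts only index types whose value is encoded in the type itself (the library's `Number<>`, which boils down to an `integral_constant`) and rejects runtime integers. A minimal sketch of such a trait, using `std::integral_constant` as a stand-in; the names here are illustrative, not the repo's exact definitions:

```cpp
// Minimal sketch, NOT the library implementation: a trait that is true only
// when the index type carries its value at compile time.
#include <type_traits>

template <typename T>
struct is_known_at_compile_time : std::false_type
{
};

// an integral_constant (what Number<> reduces to) is compile-time known
template <typename T, T v>
struct is_known_at_compile_time<std::integral_constant<T, v>> : std::true_type
{
};

static_assert(is_known_at_compile_time<std::integral_constant<int, 3>>::value, "");
static_assert(!is_known_at_compile_time<int>::value, "");
```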
@@ -758,6 +763,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         static_assert(DstAddressSpace == AddressSpace::Global or
                           DstAddressSpace == AddressSpace::Lds,
                       "wrong!");
+
+        // TODO: fix this
+        static_assert(is_same<SrcData, DstData>::value,
+                      "wrong! current implementation assumes SrcData and DstData are the same type");
     }
 
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -859,11 +868,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             return src_data_idx;
         }();
 
-        // copy data
-        typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
+        // copy data from src_buf to src_tmp_vector
+        vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
 
-        using src_vector_t =
-            typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+        using src_vector_t = typename decltype(src_tmp_vector)::type;
 
         const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
             src_desc, src_slice_origin_coord_);
@@ -871,14 +879,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(SrcAddressSpace == AddressSpace::Global)
         {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                 amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
                     p_src,
                     src_slice_origin_coord_.GetOffset(),
                     is_src_valid,
                     src_desc.GetElementSpaceSize());
 #else
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                 is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_slice_origin_coord_.GetOffset()])
                              : src_vector_t{0};
@@ -886,18 +894,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         }
         else
         {
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                 is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_slice_origin_coord_.GetOffset()])
                              : src_vector_t{0};
         }
 
+        // copy data from src_tmp_vector to buffer_
         static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
 
-            buffer_.template AsType<SrcData>()(Number<buffer_offset>{}) =
-                src_vector.template AsType<SrcData>()[i];
+            buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
         });
 
         constexpr auto move_on_dim = [&]() constexpr
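The rename from `src_vector` to `src_tmp_vector` makes the two-stage copy explicit: one vector-wide read into a temporary, then a scalar-by-scalar unpack into the thread-local buffer. A host-side sketch of the same pattern, with simplified stand-ins (`std::array` for the static buffer, `memcpy` for the vector load; none of these names are the library's):

```cpp
// Hedged sketch of the load path: vector read into a temporary, then unpack.
#include <array>
#include <cstddef>
#include <cstring>

template <typename T, int N, std::size_t BufSize>
void load_vector_then_unpack(const T* p_src,
                             std::size_t src_offset,
                             bool is_src_valid,
                             std::array<T, BufSize>& buffer,
                             std::size_t buffer_offset)
{
    T tmp[N] = {}; // plays the role of src_tmp_vector; stays zero if the read is invalid

    if(is_src_valid)
    {
        std::memcpy(tmp, p_src + src_offset, sizeof(tmp)); // the vector-width load
    }

    for(int i = 0; i < N; ++i)
    {
        buffer[buffer_offset + i] = tmp[i]; // unpack into buffer_
    }
}
```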
@@ -1048,21 +1056,21 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                           DstInMemOp == InMemoryDataOperation::Set,
                       "wrong! hardcoded for ds_write");
 
-        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
+        vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;
 
+        // copy data from buffer_ to dst_tmp_vector
         static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
 
-            dst_vector.template AsType<DstData>()(i) =
-                buffer_.template AsType<DstData>()[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
         });
 
-        using DstVectorType =
-            typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
+        using dst_vector_t = typename decltype(dst_tmp_vector)::type;
 
-        *reinterpret_cast<DstVectorType*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
-            dst_vector.template AsType<DstVectorType>()[Number<0>{}];
+        // copy data from dst_tmp_vector to dst_buf
+        *reinterpret_cast<dst_vector_t*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
+            dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}];
 
         constexpr auto move_on_dim = [&]() constexpr
         {
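The store side mirrors the load path: gather `DstScalarPerVector` scalars from the buffer into `dst_tmp_vector`, then issue one vector-width write. Again a simplified host-side sketch with illustrative names, not the library's API:

```cpp
// Hedged sketch of the store path: pack from the buffer, then one wide write.
#include <array>
#include <cstddef>
#include <cstring>

template <typename T, int N, std::size_t BufSize>
void pack_then_store_vector(const std::array<T, BufSize>& buffer,
                            std::size_t buffer_offset,
                            T* p_dst,
                            std::size_t dst_offset)
{
    T tmp[N]; // plays the role of dst_tmp_vector

    for(int i = 0; i < N; ++i)
    {
        tmp[i] = buffer[buffer_offset + i]; // gather from buffer_
    }

    std::memcpy(p_dst + dst_offset, tmp, sizeof(tmp)); // the vector-width store
}
```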
@@ -1319,7 +1327,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
 
-    typename vector_type_maker<SrcData, buffer_size_>::type buffer_;
+    StaticBuffer<SrcData, buffer_size_> buffer_;
 
     SrcCoord src_slice_origin_coord_;
     DstCoord dst_slice_origin_coord_;
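This member change is what the earlier TODOs refer to: `buffer_` becomes a `StaticBuffer`, i.e. an array addressed only with compile-time indices (`Number<>`), so the compiler can keep every element in a register. A rough sketch of the access pattern the diff relies on (`buffer_[Number<i>{}]` to read, `buffer_(Number<i>{})` to write); a simplified stand-in, not the repo's class:

```cpp
// Hedged sketch of a StaticBuffer-like type: operator[] for const access and
// operator() for mutable access, both taking a compile-time index.
#include <cstddef>

template <std::size_t I>
struct Number
{
    static constexpr std::size_t value = I;
};

template <typename T, std::size_t N>
struct StaticBuffer
{
    T data_[N];

    template <std::size_t I>
    constexpr const T& operator[](Number<I>) const
    {
        static_assert(I < N, "out of bound");
        return data_[I];
    }

    template <std::size_t I>
    constexpr T& operator()(Number<I>)
    {
        static_assert(I < N, "out of bound");
        return data_[I];
    }
};
```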
@@ -61,6 +61,7 @@ struct ThreadwiseGemm_km_kn_mn_v3
         static_for<0, E, 1>{}([&](auto e) {
             static_for<0, K, 1>{}([&](auto k) {
+#if 0
                 constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
 
                 if constexpr(H == 2 && W == 2)
@@ -123,6 +124,22 @@ struct ThreadwiseGemm_km_kn_mn_v3
                     });
                 });
             }
+#else
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
+
+                static_for<0, H, 1>{}([&](auto h) {
+                    static_for<0, W, 1>{}([&](auto w) {
+                        constexpr index_t b_offset =
+                            BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
+                        constexpr index_t c_offset =
+                            CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
+
+                        amd_assembly_inner_product(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset>{}],
+                                                   p_c[Number<c_offset>{}]);
+                    });
+                });
+#endif
             });
         });
     }
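The re-enabled `#else` branch generalizes the previously hardcoded H == 2, W == 2 case to arbitrary H and W by unrolling over both spatial dimensions. Functionally, each thread accumulates C[k][h][w] += sum over e of A[e][k] * B[e][h][w] (the unit dimension in the 4-d descriptors is dropped below). A plain host-side reference, for checking only, not the device code:

```cpp
// Illustrative reference of what the unrolled static_for loops compute.
template <int E, int K, int H, int W, typename T>
void threadwise_gemm_reference(const T (&a)[E][K], const T (&b)[E][H][W], T (&c)[K][H][W])
{
    for(int e = 0; e < E; ++e)
        for(int k = 0; k < K; ++k)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                    c[k][h][w] += a[e][k] * b[e][h][w]; // one inner-product step
}
```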
@@ -36,6 +36,44 @@ __device__ void amd_assembly_inner_product(const int8x4_t& a, const int8x4_t& b,
 #endif
 }
 
+__device__ void amd_assembly_inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
+                               c);
+}
+
+__device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
+                               c);
+}
+
 #if 0
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
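The new overloads split wider int8 vectors into int8x4 chunks, each of which feeds the existing 4-wide dot-product path (e.g. v_dot4_i32_i8 on targets that support it, or the fallback in the existing int8x4 overload). A scalar equivalent for verification, not performance; purely illustrative:

```cpp
// Scalar reference of the int8x16 overload above: four int8x4 chunks, each
// contributing a 4-element dot product accumulated into c.
#include <cstdint>

inline void inner_product_int8x16_ref(const int8_t (&a)[16], const int8_t (&b)[16], int32_t& c)
{
    for(int chunk = 0; chunk < 4; ++chunk)
    {
        for(int i = 0; i < 4; ++i)
        {
            c += int32_t(a[chunk * 4 + i]) * int32_t(b[chunk * 4 + i]);
        }
    }
}
```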
@@ -14,11 +14,11 @@
 #define CK_DEVICE_BACKEND_AMD 1
 
 // GPU ID
-#if 1
+#if 0
 #define CK_AMD_GPU_GFX906 1
 #elif 0
 #define CK_AMD_GPU_GFX908 1
-#elif 0
+#elif 1
 #define CK_AMD_GPU_GFX1030 1
 #endif
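This flips the default build target from gfx906 to gfx1030. Since the `#if`/`#elif` chain is meant to select exactly one CK_AMD_GPU_* macro, a guard like the following (hypothetical, not present in the repo) would catch a bad edit at compile time:

```cpp
// Hypothetical sanity check: exactly one GPU target macro must be selected.
#if defined(CK_AMD_GPU_GFX906) + defined(CK_AMD_GPU_GFX908) + defined(CK_AMD_GPU_GFX1030) != 1
#error "select exactly one CK_AMD_GPU_* target"
#endif
```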
@@ -53,7 +53,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     constexpr auto C0 = C / Number<InWeiVectorSize>{};
     constexpr auto C1 = Number<InWeiVectorSize>{};
 
-#if 1
+#if 0
     // run-time variables
     constexpr auto in_n_hi_wi_c0_desc =
         make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0));
@@ -112,7 +112,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
     out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
 
-#if 0
+#if 1
     // cdata = 16, BlockSize = 64, 16x64x4
     constexpr index_t BlockSize = 64;
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     constexpr index_t N = 1;
     constexpr index_t C = 16;
     constexpr index_t HI = 1080;
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", to_multi_index(ConvStrides{}));
     print_array("ConvDilations", to_multi_index(ConvDilations{}));
 
-#if 1
+#if 0
     using in_data_t = float;
     constexpr index_t in_vector_size = 1;
     using acc_data_t = float;
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
                                                                  LeftPads{},
                                                                  RightPads{},
                                                                  nrepeat);
-#elif 1
+#elif 0
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
                                                                          in_vector_size,
                                                                          acc_data_t,