Commit 5fdccfce authored by Chao Liu

Added vector load, but the resulting ISA is much worse.

parent 44ddcdcb
@@ -784,27 +784,30 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         }();

         // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, 1) * src_scalar_per_access);

-        const auto src_step_0_m1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, -1) * src_scalar_per_access);

-        const auto src_step_p1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(1, 0) * src_scalar_per_access);

-        const auto src_step_m1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);

         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

-        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
-            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
+        static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
+            static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
                 // step direction
-                constexpr bool forward_dim1 = (iter0.value % 2 == 0);
+                constexpr bool forward_dim1 = (iter0.value % (2 * src_scalar_per_access[I0]) == 0);

                 constexpr index_t i0 = iter0.value;
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
+                constexpr index_t i1 =
+                    forward_dim1 ? iter1.value
+                                 : SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;

                 // do work
                 // hardcoding for buffer_load
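As a sanity check on the rewritten loop bounds above, here is a small standalone sketch (not repo code; the 4x8 slice and the 4-wide access along dim 1 are assumed values) that prints the snake-order starting coordinates (i0, i1) the new static_for bounds generate:

// Standalone sketch of the snake-order visit pattern produced by the loop
// above, assuming SliceLengths = {4, 8}, SrcVectorDim == 1 and
// SrcScalarPerVector == 4, i.e. scalar_per_access == {1, 4}.
#include <cstdio>

int main()
{
    constexpr int slice_lengths[2]     = {4, 8};
    constexpr int scalar_per_access[2] = {1, 4};

    for(int iter0 = 0; iter0 < slice_lengths[0]; iter0 += scalar_per_access[0])
    {
        // rows alternate direction: forward on even passes, backward on odd passes
        const bool forward_dim1 = (iter0 % (2 * scalar_per_access[0]) == 0);

        for(int iter1 = 0; iter1 < slice_lengths[1]; iter1 += scalar_per_access[1])
        {
            const int i0 = iter0;
            const int i1 = forward_dim1
                               ? iter1
                               : slice_lengths[1] - scalar_per_access[1] - iter1;

            // each (i0, i1) is the start of a SrcScalarPerVector-wide access
            std::printf("(%d, %d) ", i0, i1);
        }
        std::printf("\n");
    }
}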
@@ -812,7 +815,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_assert(SrcAddressSpace == AddressSpace::Global,
                               "wrong! hardcoded to use buffer_load, src must be global mem");

-#if 1 // only works for SrcScalarPerVector == 1
+#if 0 // only works for SrcScalarPerVector == 1
                 auto src_data = amd_buffer_load<SrcData, 1>(
                     p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
@@ -822,17 +825,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 constexpr index_t buffer_offset =
                     buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

-                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
-#elif 1 // only works for SrcScalarPerVector == 1
-                auto src_data = amd_buffer_load<SrcData, 1>(
-                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
-
-                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                    src_desc, src_slice_origin_);
-
-                constexpr index_t buffer_offset =
-                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);
-
                 buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
 #else
                 vector_type<SrcData, SrcScalarPerVector> src_vector;
@@ -849,10 +841,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) * src_scalar_per_access +
-                        i * src_scalar_step_in_vector);
+                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
+                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);

-                    // TODO: can buffe_ use vector access?
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });
 #endif
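In this hunk the extra * src_scalar_per_access disappears from CalculateOffset because i0 and i1 now already advance in units of src_scalar_per_access (the loops above iterate over SliceLengths with that step), so the buffer offset is just the element coordinate plus i * src_scalar_step_in_vector. A minimal standalone sketch of that scatter, assuming a row-major 4x8 thread buffer and a 4-wide vector (assumed values, not taken from the repo):

// Standalone sketch: scattering a loaded 4-wide vector into a linear thread
// buffer at offset(i0, i1) + i * scalar_step_in_vector.
#include <array>
#include <cstdio>

int main()
{
    constexpr int N1 = 8;               // buffer length along dim 1 (assumed)
    std::array<float, 4 * N1> buffer{}; // stand-in for buffer_

    const float src_vector[4] = {1.f, 2.f, 3.f, 4.f}; // stand-in for the loaded vector

    const int i0 = 1, i1 = 4;            // start coordinate of this access
    const int scalar_step_in_vector = 1; // vector dim is dim 1, stride 1

    for(int i = 0; i < 4; ++i)
    {
        const int buffer_offset = i0 * N1 + i1 + i * scalar_step_in_vector;
        buffer[buffer_offset]   = src_vector[i];
    }

    std::printf("%f %f\n", buffer[1 * N1 + 4], buffer[1 * N1 + 7]); // 1.0 4.0
}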
@@ -882,7 +872,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(SrcResetCoordinateAfterRun)
         {
             const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());

             move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
         }
@@ -959,7 +949,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(DstResetCoordinateAfterRun)
         {
             const auto dst_back_step =
-                make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());

             move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
         }
@@ -967,8 +957,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
     {
-        // hardcoding for buffer_load
-        // TODO refactor transfer_data() to encapsulate this
+        // hardcoded for 2D
+        // TODO implemente N-D
         static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                       "wrong! hardcoded for 2D tensor");
@@ -1017,50 +1007,71 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
 #if 0 // hack
         // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, 1) * src_scalar_per_access);

-        const auto src_step_0_m1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, -1) * src_scalar_per_access);

-        const auto src_step_p1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(1, 0) * src_scalar_per_access);

-        const auto src_step_m1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
 #elif 0
         // for padded input tensor
-        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
+        const auto src_step_0_p1 =
+            make_dynamic_tensor_coordinate_step_hack(src_desc,
+                                                     make_multi_index(0, 1) * src_scalar_per_access,
+                                                     Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});

-        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(0, -1) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

-        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
+        const auto src_step_p1_0 =
+            make_dynamic_tensor_coordinate_step_hack(src_desc,
+                                                     make_multi_index(1, 0) * src_scalar_per_access,
+                                                     Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

-        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(-1, 0) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
 #elif 1
         // for non-padded input tensor
-        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
+        const auto src_step_0_p1 =
+            make_dynamic_tensor_coordinate_step_hack(src_desc,
+                                                     make_multi_index(0, 1) * src_scalar_per_access,
+                                                     Sequence<0, 0, 0, 0, 0, 0, 1>{});

-        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(0, -1) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 2>{});

-        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});
+        const auto src_step_p1_0 =
+            make_dynamic_tensor_coordinate_step_hack(src_desc,
+                                                     make_multi_index(1, 0) * src_scalar_per_access,
+                                                     Sequence<0, 0, 0, 0, 0, 1, 0>{});

-        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(-1, 0) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 2, 0>{});
 #endif

         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

-        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
-            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
+        static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
+            static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
                 // step direction
-                constexpr bool forward_dim1 = (iter0.value % 2 == 0);
+                constexpr bool forward_dim1 = (iter0.value % (2 * src_scalar_per_access[I0]) == 0);

                 constexpr index_t i0 = iter0.value;
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
+                constexpr index_t i1 =
+                    forward_dim1 ? iter1.value
+                                 : SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;

                 // do work
                 // hardcoding for buffer_load
@@ -1068,7 +1079,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_assert(SrcAddressSpace == AddressSpace::Global,
                               "wrong! hardcoded to use buffer_load, src must be global mem");

-#if 1 // only works for SrcScalarPerVector == 1
+#if 0 // only works for SrcScalarPerVector == 1
                 auto src_data = amd_buffer_load<SrcData, 1>(
                     p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
@@ -1076,10 +1087,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                     src_desc, src_slice_origin_);

-                constexpr index_t buffer_offset =
-                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);
+                constexpr index_t buffer_offset =
+                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                 buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
-#elif 1
+#else
                 vector_type<SrcData, SrcScalarPerVector> src_vector;

                 using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
@@ -1094,10 +1105,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) * src_scalar_per_access +
-                        i * src_scalar_step_in_vector);
+                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
+                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);

-                    // TODO: can buffe_ use vector access?
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });
 #endif
@@ -1127,25 +1136,77 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(SrcResetCoordinateAfterRun)
         {
             const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());

             move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
         }
     }

-    __device__ static constexpr auto GetCoordinateBackStep()
+    __device__ static constexpr auto GetSrcCoordinateBackStep()
     {
-        MultiIndex<nDim> back_step;
+        constexpr auto src_scalar_per_access = [&]() {
+            Index src_scalar_per_access;

-        back_step(Number<0>{}) = 1 - SliceLengths{}[0];
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
+                }
+                else
+                {
+                    src_scalar_per_access(i) = 1;
+                }
+            });
+
+            return src_scalar_per_access;
+        }();
+
+        MultiIndex<nDim> src_back_step;
+
+        src_back_step(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

         static_for<1, nDim, 1>{}([&](auto i) {
-            back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
+            constexpr auto i_m1 = i - Number<1>{};
+
+            src_back_step(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
+                                   ? 0
+                                   : (src_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return back_step;
+        return src_back_step;
     }

+    __device__ static constexpr auto GetDstCoordinateBackStep()
+    {
+        constexpr auto dst_scalar_per_access = [&]() {
+            Index dst_scalar_per_access;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    dst_scalar_per_access(i) = DstScalarPerVector * DstScalarStrideInVector;
+                }
+                else
+                {
+                    dst_scalar_per_access(i) = 1;
+                }
+            });
+
+            return dst_scalar_per_access;
+        }();
+
+        MultiIndex<nDim> dst_back_step;
+
+        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
+
+        static_for<1, nDim, 1>{}([&](auto i) {
+            constexpr auto i_m1 = i - Number<1>{};
+
+            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
+                                   ? 0
+                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
+        });
+
+        return dst_back_step;
+    }
+
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                        const Index& src_slice_origin_step_idx)
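The new GetSrcCoordinateBackStep() first builds src_scalar_per_access (SrcScalarPerVector * SrcScalarStrideInVector on the vector dimension, 1 elsewhere) and then undoes the snake traversal: dimension 0 always ends at SliceLengths[0] - src_scalar_per_access[0], and dimension i ends at 0 when dimension i-1 made an even number of passes, otherwise at SliceLengths[i] - src_scalar_per_access[i]. A standalone sketch of the same arithmetic with assumed values SliceLengths = {3, 8} and scalar-per-access = {1, 4}:

// Standalone sketch (not repo code) of the 2D back-step computation.
#include <cstdio>

int main()
{
    constexpr int nDim                    = 2;
    constexpr int slice_lengths[nDim]     = {3, 8};
    constexpr int scalar_per_access[nDim] = {1, 4};

    int back_step[nDim];

    // dim 0 always ends at slice_lengths[0] - scalar_per_access[0]
    back_step[0] = scalar_per_access[0] - slice_lengths[0];

    // dim i ends at 0 if dim i-1 ran an even number of passes (the snake returns),
    // otherwise at slice_lengths[i] - scalar_per_access[i]
    for(int i = 1; i < nDim; ++i)
    {
        back_step[i] = (slice_lengths[i - 1] % (2 * scalar_per_access[i - 1]) == 0)
                           ? 0
                           : scalar_per_access[i] - slice_lengths[i];
    }

    // with 3 rows the last row runs forward, so the traversal ends at (2, 4);
    // the printed back-step {-2, -4} returns the coordinate to the slice origin
    std::printf("back_step = {%d, %d}\n", back_step[0], back_step[1]);
}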
@@ -1153,7 +1214,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if src coord was not reset by RunRead(), then need to adjust the step here
         const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                            ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();

         // is it OK to construct a new step every time?
         const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
@@ -1168,7 +1229,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if dst coord was not reset by RunWrite(), then need to adjust the step here
         const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                            ? dst_slice_origin_step_idx
-                                           : dst_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : dst_slice_origin_step_idx + GetDstCoordinateBackStep();

         // is it OK to construct a new step every time?
         const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
@@ -1183,7 +1244,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if src coord was not reset by RunRead(), then need to adjust the step here
         const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                            ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();

         // is it OK to construct a new step every time?
 #if 0 // hack
...
@@ -187,10 +187,15 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
     return __llvm_amdgcn_buffer_load_f32x2(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
+#if 0
     float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
         src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

     return src_thread_data_valid ? tmp : float2_t(0);
+#else
+    return __llvm_amdgcn_buffer_load_f32x2(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+#endif
 #endif
 }
@@ -217,10 +222,15 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
     return __llvm_amdgcn_buffer_load_f32x4(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
+#if 0
     float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
         src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

     return src_thread_data_valid ? tmp : float4_t(0);
+#else
+    return __llvm_amdgcn_buffer_load_f32x4(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+#endif
 #endif
 }
...
@@ -184,6 +184,8 @@ struct vector_type<float, 1>
     float data_;

+    __host__ __device__ explicit constexpr vector_type() : data_{0} {}
+
     __host__ __device__ static constexpr index_t Size() { return 1; }

     __host__ __device__ constexpr const auto& Vector() const { return data_; }
@@ -212,25 +214,34 @@ struct vector_type<float, 2>
 {
     using MemoryType = float2_t;

-    union DataType
+    union
     {
-        MemoryType vector;
-        float scalar[2];
-    };
+        float2_t vector_;
+        StaticallyIndexedArray<float, 2> scalars_;
+    } data_;
+
+    __host__ __device__ explicit constexpr vector_type() : data_{MemoryType{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 2; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }

     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
-        static_assert(I < 2, "wrong");
+        static_assert(I >= 0 && I < 2, "wrong!");

-        *(reinterpret_cast<float*>(&v) + I) = s;
+        return data_.scalars_[Number<I>{}];
     }

-    __host__ __device__ static MemoryType Pack(float s0, float s1)
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
     {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        return data.vector;
+        static_assert(I >= 0 && I < 2, "wrong!");
+
+        return data_.scalars_(Number<I>{});
     }
 };
@@ -241,37 +252,24 @@ struct vector_type<float, 4>
     union
     {
-        float4_t v;
-        float s0, s1, s2, s3;
+        float4_t vector_;
+        StaticallyIndexedArray<float, 4> scalars_;
     } data_;

+    __host__ __device__ explicit constexpr vector_type() : data_{MemoryType{0}} {}
+
     __host__ __device__ static constexpr index_t Size() { return 4; }

-    __host__ __device__ constexpr const auto& Vector() const { return data_.v; }
+    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }

-    __host__ __device__ constexpr auto& Vector() { return data_.v; }
+    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }

     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
         static_assert(I >= 0 && I < 4, "wrong!");

-        if constexpr(I == 0)
-        {
-            return data_.s0;
-        }
-        else if constexpr(I == 1)
-        {
-            return data_.s1;
-        }
-        else if constexpr(I == 2)
-        {
-            return data_.s2;
-        }
-        else
-        {
-            return data_.s3;
-        }
+        return data_.scalars_[Number<I>{}];
     }

     template <index_t I>
@@ -279,22 +277,7 @@ struct vector_type<float, 4>
     {
         static_assert(I >= 0 && I < 4, "wrong!");

-        if constexpr(I == 0)
-        {
-            return data_.s0;
-        }
-        else if constexpr(I == 1)
-        {
-            return data_.s1;
-        }
-        else if constexpr(I == 2)
-        {
-            return data_.s2;
-        }
-        else
-        {
-            return data_.s3;
-        }
+        return data_.scalars_(Number<I>{});
     }
 };
...
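The reworked vector_type keeps one storage object that is viewed either as a whole vector (what the buffer load writes) or element by element (what the scatter into buffer_ reads). A self-contained sketch of that union layout, substituting the GCC/Clang vector_size extension for the repo's float2_t typedef and a plain array for StaticallyIndexedArray (both substitutions are mine):

// Standalone sketch of the vector/scalar union view.
#include <cstdio>

// GCC/Clang vector extension standing in for the repo's float2_t typedef
typedef float float2_t __attribute__((vector_size(2 * sizeof(float))));

struct vector_type_f2_sketch
{
    union
    {
        float2_t vector_;
        float scalars_[2]; // stand-in for StaticallyIndexedArray<float, 2>
    } data_;

    const float2_t& Vector() const { return data_.vector_; }
    float2_t& Vector() { return data_.vector_; }

    // element access goes through the scalar view of the same storage
    // (union type punning, which GCC/Clang document as supported)
    float operator[](int i) const { return data_.scalars_[i]; }
    float& operator()(int i) { return data_.scalars_[i]; }
};

int main()
{
    vector_type_f2_sketch v{};
    float2_t loaded = {1.f, 2.f};       // pretend this came from a 2-wide buffer load
    v.Vector() = loaded;                // store as a whole vector
    std::printf("%f %f\n", v[0], v(1)); // read back element-wise: 1.0 2.0
}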
@@ -135,7 +135,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

     using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

     constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 256, 128x128x8
     // b thread copy 2x2
     constexpr index_t BlockSize = 256;
...
@@ -550,7 +550,7 @@ int main(int argc, char* argv[])
     in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
-    in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
...