"vscode:/vscode.git/clone" did not exist on "8606e33844dd8a8a6cbbdb9f8a2566dc971b1cd3"
Commit c1ed17f8 authored by Chao Liu's avatar Chao Liu
Browse files

improving index calculation: change to UpdateIndexDiff()

parent 77c81617
...@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad ...@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
const index_t GemmM0 = GemmM / GemmM1; const index_t GemmM0 = GemmM / GemmM1;
const index_t GemmN0 = GemmN / GemmN1; const index_t GemmN0 = GemmN / GemmN1;
#if 1 // debug
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc = const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor( transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc, out_gemmm_gemmn_global_desc,
...@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad ...@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}), DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#else
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(
HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#endif
// GEMM // GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1< using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
......
...@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe ...@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto idx_up = get_container_subset(idx_hidden, dims_up); const auto idx_up_new = get_container_subset(idx_hidden, dims_up);
auto idx_low = get_container_subset(idx_hidden, dims_low); auto idx_low = get_container_subset(idx_hidden, dims_low);
const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up); const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_diff_low; MultiIndex<dims_low.Size()> idx_diff_low;
// calculate idx_diff_low // HACK: control UpdateLowerIndex for DynamicMerge using hack
#if 0 // hack
tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
// TODO remove hack // TODO remove hack
constexpr index_t Hack = constexpr index_t Hack =
decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran); decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
tran.CalculateLowerIndexDiff_hack( tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif
// update idx_low
idx_low += idx_diff_low;
set_container_subset(idx_diff_hidden, dims_low, idx_diff_low); set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
set_container_subset(idx_hidden, dims_low, idx_low); set_container_subset(idx_hidden, dims_low, idx_low);
......
...@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1 ...@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
Float, Float,
decltype(c_m0_m1_n0_n1_thread_desc), decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc), decltype(c_m0_m1_n0_n1_global_desc),
#if 1 // debug
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>, Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
#else
Sequence<1, 1, 2, 4>,
Sequence<0, 1, 2, 3>,
#endif
CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
1, 1,
CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
......
...@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 ...@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{}); src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack( const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{}); src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0 #elif 1
// for non-padded input tensor // for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack( const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{}); src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
...@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 ...@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// for padded input tensor // for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack( const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{}); src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0 #elif 1
// for non-paded input tensor // for non-paded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack( const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{}); src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
......
...@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave, ...@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
return __llvm_amdgcn_buffer_load_f32( return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else #else
#if 1 // debug #if 0 // debug
float tmp = __llvm_amdgcn_buffer_load_f32( float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false); src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
......
...@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc ...@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver = constexpr auto conv_driver =
#if 1 // debug #if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else #else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment