improving index calculation: change to UpdateIndexDiff()

c1ed17f8 · Chao Liu · 77c81617 · c1ed17f8 · c1ed17f8 · c1ed17f8
Commit c1ed17f8 authored Jan 17, 2021 by Chao Liu
7 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
        const index_t GemmM0 = GemmM / GemmM1;
        const index_t GemmN0 = GemmN / GemmN1;
-#if 1 // debug
        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
            transform_dynamic_tensor_descriptor(
                out_gemmm_gemmn_global_desc,
@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
                           DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#else
-        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
-            transform_dynamic_tensor_descriptor(
-                out_gemmm_gemmn_global_desc,
-                make_tuple(
-                    HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
-                    HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#endif
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<

--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
            constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
            constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);
-            const auto idx_up      = get_container_subset(idx_hidden, dims_up);
+            const auto idx_up_new  = get_container_subset(idx_hidden, dims_up);
            auto idx_low           = get_container_subset(idx_hidden, dims_low);
            const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);
            MultiIndex<dims_low.Size()> idx_diff_low;
-            // calculate idx_diff_low
+            // HACK: control UpdateLowerIndex for DynamicMerge via the per-transform Hack value
-#if 0 // hack
-            tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
-#else
-            // HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
            // TODO remove hack
            constexpr index_t Hack =
                decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
-            tran.CalculateLowerIndexDiff_hack(
+            tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
-                idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
-#endif
-            // update idx_low
-            idx_low += idx_diff_low;
            set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
            set_container_subset(idx_hidden, dims_low, idx_low);

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                Float,
                decltype(c_m0_m1_n0_n1_thread_desc),
                decltype(c_m0_m1_n0_n1_global_desc),
-#if 1 // debug
                Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
                CThreadTransferSrcDstAccessOrder,
-#else
-                Sequence<1, 1, 2, 4>,
-                Sequence<0, 1, 2, 3>,
-#endif
                CThreadTransferSrcDstVectorDim,
                1,
                CThreadTransferDstScalarPerVector,

--- a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
-#elif 0
+#elif 1
            // for non-padded input tensor
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
-#elif 0
+#elif 1
        // for non-padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});

--- a/composable_kernel/include/utility/amd_buffer_addressing.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
    return __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
-#if 1 // debug
+#if 0 // debug
    float tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
    constexpr auto conv_driver =
-#if 1 // debug
+#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad