Update blockwise indexing, working version

06eacec9 · Rosty Geyyer · 581d244c
Commit 06eacec9 authored Feb 03, 2023 by Rosty Geyyer
3 changed files
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp
@@ -60,9 +60,9 @@ using DeviceConvBwdWeightInstance =
        S<1, 2, 1, 128, 1>,   // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
        S<0, 2, 3, 1, 4>,     // ABlockTransferThreadClusterArrangeOrder
        S<0, 2, 3, 1, 4>,     // ABlockTransferSrcAccessOrder
-        S<1, 4, 1, 1, 2>,     // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 1, 1, 1, 1>,     // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
        S<0, 2, 3, 1, 4>,     // ABlockTransferSrcVectorTensorContiguousDimOrder
-        S<1, 1, 1, 1, 2>,     // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<1, 1, 1, 1, 1>,     // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
        S<1, 1, 1, 8, 2>,     // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
        S<1, 16, 1, 16, 1>,   // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
        S<0, 1, 4, 2, 3>,     // BBlockTransferThreadClusterArrangeOrder

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
@@ -138,17 +138,17 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
    : public DeviceGroupedConvBwdWeight<
          NDimSpatial,
          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::NWC,
+                              ck::Tuple<ck::tensor_layout::convolution::GNWC,
-                                        ck::tensor_layout::convolution::NHWC,
+                                        ck::tensor_layout::convolution::GNHWC,
-                                        ck::tensor_layout::convolution::NDHWC>>,
+                                        ck::tensor_layout::convolution::GNDHWC>>,
          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::KXC,
+                              ck::Tuple<ck::tensor_layout::convolution::GKXC,
-                                        ck::tensor_layout::convolution::KYXC,
+                                        ck::tensor_layout::convolution::GKYXC,
-                                        ck::tensor_layout::convolution::KZYXC>>,
+                                        ck::tensor_layout::convolution::GKZYXC>>,
          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::NWK,
+                              ck::Tuple<ck::tensor_layout::convolution::GNWK,
-                                        ck::tensor_layout::convolution::NHWK,
+                                        ck::tensor_layout::convolution::GNHWK,
-                                        ck::tensor_layout::convolution::NDHWK>>,
+                                        ck::tensor_layout::convolution::GNDHWK>>,
          InDataType,
          WeiDataType,
          OutDataType,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
@@ -812,11 +812,9 @@ struct GridwiseGemmDl_bkm_bkn_mn_v1r3
        }
        // HACK: this force m/n_block_data_idx_on_grid into SGPR
-        const index_t m_block_data_idx_on_grid =
+        const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
-            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
-        const index_t n_block_data_idx_on_grid =
+        const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I2]);
-            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
        // TODO: change this. I think it needs multi-dimensional alignment
        constexpr auto max_lds_align = K1;
@@ -980,7 +978,7 @@ struct GridwiseGemmDl_bkm_bkn_mn_v1r3
        if constexpr(HasMainKBlockLoop)
        {
-            const auto K0 = a_grid_desc_b_k0_m0_m1_k1.GetLength(I0);
+            const auto K0 = a_grid_desc_b_k0_m0_m1_k1.GetLength(I1);
            index_t k_block_data_begin = 0;