testing

e4b77dcf · Jing Zhang · 58ee3f13 · e4b77dcf · e4b77dcf · e4b77dcf
Commit e4b77dcf authored Jun 04, 2021 by Jing Zhang
7 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
@@ -158,7 +158,7 @@ transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad(
    constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks =
        Sequence<0, 0, 0, 0, 0>{};
-#if 0
+#if 1
    // hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
    constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},

--- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
@@ -211,7 +211,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
                                                                Sequence<1, MRepeat, 1, KPack>,
                                                                Sequence<0, 1, 2, 3>,
                                                                3,
-                                                                1, // KPack,
+                                                                KPack,
                                                                1>;
    using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
@@ -221,7 +221,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
                                                                Sequence<1, NRepeat, 1, KPack>,
                                                                Sequence<0, 1, 2, 3>,
                                                                3,
-                                                                1, // KPack,
+                                                                KPack,
                                                                1>;
    AThreadCopy a_thread_copy_;

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
@@ -141,7 +141,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
 {
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
-        constexpr auto max_lds_align = KPack;
+        constexpr auto max_lds_align = Number<KPack>{};
        // A matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
@@ -204,7 +204,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
        // lds max alignment
-        constexpr auto max_lds_align = KPack;
+        constexpr auto max_lds_align = Number<KPack>{};
        // A matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
@@ -229,11 +229,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
                                                   decltype(a_k0_m_k1_global_desc),
                                                   decltype(a_k0_m_k1_block_desc),
                                                   ABlockTransferSrcAccessOrder,
-                                                   Sequence<2, 0, 1>,
+                                                   Sequence<1, 0, 2>,
-                                                   2, // ABlockTransferSrcVectorDim,
+                                                   ABlockTransferSrcVectorDim,
                                                   2,
-                                                   1, // ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferSrcScalarPerVector,
-                                                   1, // ABlockTransferDstScalarPerVector_KPack,
+                                                   ABlockTransferDstScalarPerVector_KPack,
                                                   1,
                                                   1,
                                                   AThreadTransferSrcResetCoordinateAfterRun,
@@ -256,11 +256,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
                                                   decltype(b_k0_n_k1_global_desc),
                                                   decltype(b_k0_n_k1_block_desc),
                                                   BBlockTransferSrcAccessOrder,
-                                                   Sequence<2, 0, 1>,
+                                                   Sequence<1, 0, 2>,
-                                                   1, // BBlockTransferSrcVectorDim,
+                                                   BBlockTransferSrcVectorDim,
                                                   2,
-                                                   1, // BBlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
-                                                   1, // BBlockTransferDstScalarPerVector_KPack,
+                                                   BBlockTransferDstScalarPerVector_KPack,
                                                   1,
                                                   1,
                                                   BThreadTransferSrcResetCoordinateAfterRun,
@@ -282,8 +282,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
                          NPerBlock % (NPerWave * NRepeat) == 0,
                      "wrong!");
-        static_assert(KPack == 1, "");
        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
            a_k0_m_k1_block_desc,
            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),

--- a/composable_kernel/include/utility/math.hpp
+++ b/composable_kernel/include/utility/math.hpp
@@ -61,7 +61,7 @@ struct integer_divide_ceiler
    {
        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-        return (a + b - 1) / b;
+        return (a + b - Number<1>{}) / b;
    }
 };
@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
 template <class X, class Y>
 __host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
 {
-    return (x + y - 1) / y;
+    return (x + y - Number<1>{}) / y;
 }
 template <class X, class Y>

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
@@ -79,10 +79,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
    const auto in_right_pads    = sequence_to_tuple_of_number(InRightPads{});
 #endif
+#if 1
    constexpr index_t BlockSize = 256;
    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 4;
+    constexpr index_t MRepeat = 1;
+    constexpr index_t NRepeat = 1;
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#else
+    constexpr index_t BlockSize = 256;
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 256;
    constexpr index_t GemmKPerBlock = 16;
    constexpr index_t GemmMPerWave = 64;
@@ -90,21 +118,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
    constexpr index_t GemmKPack    = 1;
    constexpr index_t MRepeat = 1;
-    constexpr index_t NRepeat = 1;
+    constexpr index_t NRepeat = 2;
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<4, 2, GemmKPack>;
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<4, 2, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 1;
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<2, 4, GemmKPack>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<4, 4, 1>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<8, 32, 1>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 1;
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#endif
    const auto descs =
        transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad<TInWei,
@@ -152,7 +181,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
            Sequence<0, 2, 1>,
-            Sequence<0, 2, 1>,
+            Sequence<1, 0, 2>,
            1,
            GemmBBlockTransferSrcScalarPerVector_GemmN,
            GemmBBlockTransferDstScalarPerVector_KPack,

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -24,7 +24,7 @@ int main(int argc, char* argv[])
 {
    using namespace ck;
-#if 1
+#if 0
    constexpr index_t N  = 256;
    constexpr index_t C  = 256;
    constexpr index_t HI = 16;

--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -10,7 +10,7 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                      \
 -D CMAKE_BUILD_TYPE=Release                                                                                                        \
 -D DEVICE_BACKEND="AMD"                                                                                                            \
--D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0"                   \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0"                   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                          \
 -D CMAKE_PREFIX_PATH="/opt/rocm"                                                                                                   \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                  \