change some variable name

4ec493ec · ltqin · 1343569e · 4ec493ec · 4ec493ec · 4ec493ec
Commit 4ec493ec authored Sep 10, 2021 by ltqin
3 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ message(STATUS "Build with HIP ${hip_VERSION}")
 message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")

 # CMAKE_CXX_FLAGS
-SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+SET(BUILD_DEV OFF CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
    string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything")
 endif()

--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk.hpp
@@ -15,16 +15,16 @@ template <typename TInWei,
          typename InLeftPads,
          typename InRightPads>
 void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
-    const InLengths& in_n_hi_wi_c_lengths,
-    const WeiLengths& wei_k_y_x_c_lengths,
-    const OutLengths& out_n_ho_wo_k_lengths,
+    const InLengths& in_n_hi_wi_g_c_lengths,
+    const WeiLengths& wei_g_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_g_k_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_hi_wi_c,
-    const Tensor<TInWei>& wei_k_y_x_c,
-    Tensor<TOut>& out_n_ho_wo_k,
+    const Tensor<TInWei>& in_n_hi_wi_g_c,
+    const Tensor<TInWei>& wei_g_k_y_x_c,
+    Tensor<TOut>& out_n_ho_wo_g_k,
    ck::index_t nrepeat)
 {
    using namespace ck;
@@ -35,19 +35,19 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
-    //constexpr auto I4 = Number<4>{};
+    constexpr auto I4 = Number<4>{};

-    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
-    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
-    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+    DeviceMem in_n_hi_wi_g_c_device_buf(sizeof(TInWei) * in_n_hi_wi_g_c.mDesc.GetElementSpace());
+    DeviceMem wei_g_k_y_x_c_device_buf(sizeof(TInWei) * wei_g_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_g_k_device_buf(sizeof(TOut) * out_n_ho_wo_g_k.mDesc.GetElementSpace());

-    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
-    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
-    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+    in_n_hi_wi_g_c_device_buf.ToDevice(in_n_hi_wi_g_c.mData.data());
+    wei_g_k_y_x_c_device_buf.ToDevice(wei_g_k_y_x_c.mData.data());
+    out_n_ho_wo_g_k_device_buf.ToDevice(out_n_ho_wo_g_k.mData.data());

-    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
-    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
+    const auto in_n_hi_wi_g_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_g_c_lengths);
+    const auto wei_g_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_g_k_y_x_c_lengths);
+    const auto out_n_ho_wo_g_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_g_k_lengths);

 #if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
@@ -176,14 +176,14 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;

-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 64, 1>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;

-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 64, 1>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
@@ -220,9 +220,9 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
 #endif

    const auto descs =
-        transform_forward_convolution_into_gemm_v4r4r4_nhwgc_gkyxc_nhwgk_pad(in_n_hi_wi_c_desc,
-                                                                          wei_k_y_x_c_desc,
-                                                                          out_n_ho_wo_k_desc,
+        transform_forward_convolution_into_gemm_v4r4r4_nhwgc_gkyxc_nhwgk_pad(in_n_hi_wi_g_c_desc,
+                                                                          wei_g_k_y_x_c_desc,
+                                                                          out_n_ho_wo_g_k_desc,
                                                                          conv_strides,
                                                                          conv_dilations,
                                                                          in_left_pads,
@@ -351,5 +351,5 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwgc_gkyxc_nhwgk(
    }
 */
    // copy result back to host
-    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
+    out_n_ho_wo_g_k_device_buf.FromDevice(out_n_ho_wo_g_k.mData.data());
 }
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -9,7 +9,7 @@ MY_PROJECT_INSTALL=../install.dir
 cmake                                                                                                                                          \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                                  \
 -D HALF_INCLUDE_DIR="/root/workspace/external/half/include"                                                                                    \
-D BUILD_DEV=ON                                                                                                                                \
+-D BUILD_DEV=OFF                                                                                                                                \
 -D CMAKE_BUILD_TYPE=Release                                                                                                                    \
 -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \