Unverified commit a93d07c7, authored by Illia Silin and committed by GitHub
Browse files

Merge branch 'develop' into ck_codegen_build

parents 9d9ad510 afbf6350
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Registers the FP16 XDL "large tensor" 3D grouped forward-convolution instances
// (NDHWGC input / GKZYXC weight / NDHWGK output layouts) into the caller's list.
void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F16,
                                                                F16,
                                                                Empty_Tuple,
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    // Factory tuple enumerating every FP16 large-tensor kernel configuration
    // for the default (non-padded-specialization) forward convolution.
    const auto f16_instance_factory =
        device_grouped_conv_fwd_xdl_large_tensor_f16_instances<3,
                                                               NDHWGC,
                                                               GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               ConvFwdDefault>{};
    add_device_operation_instances(instances, f16_instance_factory);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Registers the FP32 XDL "large tensor" 3D grouped forward-convolution instances
// (NDHWGC input / GKZYXC weight / NDHWGK output layouts) into the caller's list.
void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F32,
                                                                F32,
                                                                Empty_Tuple,
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    // Factory tuple enumerating every FP32 large-tensor kernel configuration
    // for the default (non-padded-specialization) forward convolution.
    const auto f32_instance_factory =
        device_grouped_conv_fwd_xdl_large_tensor_f32_instances<3,
                                                               NDHWGC,
                                                               GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               ConvFwdDefault>{};
    add_device_operation_instances(instances, f32_instance_factory);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
# Folders for generated FMHA kernel sources and the hand-written headers they need.
set(FMHA_CPP_FOLDER ${CMAKE_CURRENT_BINARY_DIR})
set(FMHA_SRC_FOLDER ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/)
set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/)
# python stuff
find_package(PythonInterp 3 REQUIRED)
rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile)
# NOTE(review): this uses a literal "include/" prefix while the DIRECTORY install
# above uses ${CMAKE_INSTALL_INCLUDEDIR} -- confirm both are intended to match.
rocm_install(FILES
    "${FMHA_SRC_FOLDER}/fmha_fwd.hpp"
    "${FMHA_SRC_FOLDER}/bias.hpp"
    "${FMHA_SRC_FOLDER}/mask.hpp"
    DESTINATION include/ck_tile/ops
)
# header for building lib
file(COPY ${FMHA_SRC_FOLDER}/fmha_fwd.hpp DESTINATION ${FMHA_CPP_FOLDER})
file(COPY ${FMHA_SRC_FOLDER}/bias.hpp DESTINATION ${FMHA_CPP_FOLDER})
file(COPY ${FMHA_SRC_FOLDER}/mask.hpp DESTINATION ${FMHA_CPP_FOLDER})
# generate a list of kernels, but not actually emit files at config stage
execute_process(
    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
    --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
)
file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
# actually generate the cpp files
add_custom_command(
    OUTPUT ${FMHA_FWD_GEN_BLOBS}
    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
    --output_dir ${FMHA_CPP_FOLDER}
    COMMENT "Generating mha kernel (cpp) files now ..."
    VERBATIM
)
# This is done to remove path info and just
# have filename. Since, it was causing the cmake
# to throw "File name too long"
set(device_files)
foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
    get_filename_component(filename ${filepath} NAME)
    # Append the filename to the device_files list
    list(APPEND filename ${filename})
    list(APPEND device_files ${filename})
endforeach()
add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
add_instance_library(device_mha_instance ${device_files})
if (TARGET device_mha_instance)
    add_dependencies(device_mha_instance generate_cpp_files)
endif()
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/host_utility/io.hpp" #include "ck/host_utility/io.hpp"
...@@ -20,6 +20,63 @@ ConvParam::ConvParam(ck::index_t n_dim, ...@@ -20,6 +20,63 @@ ConvParam::ConvParam(ck::index_t n_dim,
const std::vector<ck::index_t>& dilations, const std::vector<ck::index_t>& dilations,
const std::vector<ck::index_t>& left_pads, const std::vector<ck::index_t>& left_pads,
const std::vector<ck::index_t>& right_pads) const std::vector<ck::index_t>& right_pads)
: num_dim_spatial_(static_cast<ck::long_index_t>(n_dim)),
G_(static_cast<ck::long_index_t>(group_count)),
N_(static_cast<ck::long_index_t>(n_batch)),
K_(static_cast<ck::long_index_t>(n_out_channels)),
C_(static_cast<ck::long_index_t>(n_in_channels)),
filter_spatial_lengths_(num_dim_spatial_),
input_spatial_lengths_(num_dim_spatial_),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(num_dim_spatial_),
conv_filter_dilations_(num_dim_spatial_),
input_left_pads_(num_dim_spatial_),
input_right_pads_(num_dim_spatial_)
{
if(static_cast<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{
throw(
std::runtime_error("ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
{
filter_spatial_lengths_[i] = static_cast<ck::long_index_t>(filters_len[i]);
input_spatial_lengths_[i] = static_cast<ck::long_index_t>(input_len[i]);
conv_filter_strides_[i] = static_cast<ck::long_index_t>(strides[i]);
conv_filter_dilations_[i] = static_cast<ck::long_index_t>(dilations[i]);
input_left_pads_[i] = static_cast<ck::long_index_t>(left_pads[i]);
input_right_pads_[i] = static_cast<ck::long_index_t>(right_pads[i]);
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
ConvParam::ConvParam(ck::long_index_t n_dim,
ck::long_index_t group_count,
ck::long_index_t n_batch,
ck::long_index_t n_out_channels,
ck::long_index_t n_in_channels,
const std::vector<ck::long_index_t>& filters_len,
const std::vector<ck::long_index_t>& input_len,
const std::vector<ck::long_index_t>& strides,
const std::vector<ck::long_index_t>& dilations,
const std::vector<ck::long_index_t>& left_pads,
const std::vector<ck::long_index_t>& right_pads)
: num_dim_spatial_(n_dim), : num_dim_spatial_(n_dim),
G_(group_count), G_(group_count),
N_(n_batch), N_(n_batch),
...@@ -49,7 +106,8 @@ ConvParam::ConvParam(ck::index_t n_dim, ...@@ -49,7 +106,8 @@ ConvParam::ConvParam(ck::index_t n_dim,
{ {
// XEff = (X - 1) * conv_dilation_w + 1; // XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1; const ck::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] = output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) / (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
...@@ -63,7 +121,7 @@ ConvParam::ConvParam() ...@@ -63,7 +121,7 @@ ConvParam::ConvParam()
{ {
} }
std::vector<ck::index_t> ConvParam::GetOutputSpatialLengths() const std::vector<ck::long_index_t> ConvParam::GetOutputSpatialLengths() const
{ {
return output_spatial_lengths_; return output_spatial_lengths_;
} }
...@@ -97,46 +155,46 @@ std::string get_conv_param_parser_helper_msg() ...@@ -97,46 +155,46 @@ std::string get_conv_param_parser_helper_msg()
ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]) ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{ {
const ck::index_t G = std::stoi(argv[arg_idx++]); const ck::long_index_t G = std::stol(argv[arg_idx++]);
const ck::index_t N = std::stoi(argv[arg_idx++]); const ck::long_index_t N = std::stol(argv[arg_idx++]);
const ck::index_t K = std::stoi(argv[arg_idx++]); const ck::long_index_t K = std::stol(argv[arg_idx++]);
const ck::index_t C = std::stoi(argv[arg_idx++]); const ck::long_index_t C = std::stol(argv[arg_idx++]);
std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial); std::vector<ck::long_index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial); std::vector<ck::long_index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> conv_filter_strides(num_dim_spatial); std::vector<ck::long_index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial); std::vector<ck::long_index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial); std::vector<ck::long_index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial); std::vector<ck::long_index_t> input_right_pads(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
} }
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
} }
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
conv_filter_strides[i] = std::stoi(argv[arg_idx++]); conv_filter_strides[i] = std::stol(argv[arg_idx++]);
} }
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
} }
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
input_left_pads[i] = std::stoi(argv[arg_idx++]); input_left_pads[i] = std::stol(argv[arg_idx++]);
} }
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
input_right_pads[i] = std::stoi(argv[arg_idx++]); input_right_pads[i] = std::stol(argv[arg_idx++]);
} }
return ck::utils::conv::ConvParam{num_dim_spatial, return ck::utils::conv::ConvParam{num_dim_spatial,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -82,6 +82,29 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -82,6 +82,29 @@ bool profile_conv_bwd_data_impl(int do_verification,
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc); Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc); Tensor<OutDataType> output(out_g_n_k_wos_desc);
std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
for(ck::index_t d = 0; d < NDimSpatial; d++)
{
input_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
filter_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
output_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.GetOutputSpatialLengths()[d]);
conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
conv_filter_dilations_i32[d] =
static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
input_left_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
}
std::cout << "input: " << input_host_result.mDesc << std::endl; std::cout << "input: " << input_host_result.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl; std::cout << "output: " << output.mDesc << std::endl;
...@@ -161,16 +184,16 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -161,16 +184,16 @@ bool profile_conv_bwd_data_impl(int do_verification,
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.N_, static_cast<ck::index_t>(conv_param.N_),
conv_param.K_, static_cast<ck::index_t>(conv_param.K_),
conv_param.C_, static_cast<ck::index_t>(conv_param.C_),
conv_param.input_spatial_lengths_, input_spatial_lengths_i32,
conv_param.filter_spatial_lengths_, filter_spatial_lengths_i32,
conv_param.output_spatial_lengths_, output_spatial_lengths_i32,
conv_param.conv_filter_strides_, conv_filter_strides_i32,
conv_param.conv_filter_dilations_, conv_filter_dilations_i32,
conv_param.input_left_pads_, input_left_pads_i32,
conv_param.input_right_pads_, input_right_pads_i32,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -60,6 +60,29 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -60,6 +60,29 @@ bool profile_conv_fwd_impl(int do_verification,
Tensor<OutDataType> host_output(out_g_n_k_wos_desc); Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc); Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
for(ck::index_t d = 0; d < NDimSpatial; d++)
{
input_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
filter_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
output_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.GetOutputSpatialLengths()[d]);
conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
conv_filter_dilations_i32[d] =
static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
input_left_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
}
std::cout << "input: " << input.mDesc << std::endl; std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl;
...@@ -143,16 +166,16 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -143,16 +166,16 @@ bool profile_conv_fwd_impl(int do_verification,
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.N_, static_cast<ck::index_t>(conv_param.N_),
conv_param.K_, static_cast<ck::index_t>(conv_param.K_),
conv_param.C_, static_cast<ck::index_t>(conv_param.C_),
conv_param.input_spatial_lengths_, input_spatial_lengths_i32,
conv_param.filter_spatial_lengths_, filter_spatial_lengths_i32,
conv_param.GetOutputSpatialLengths(), output_spatial_lengths_i32,
conv_param.conv_filter_strides_, conv_filter_strides_i32,
conv_param.conv_filter_dilations_, conv_filter_dilations_i32,
conv_param.input_left_pads_, input_left_pads_i32,
conv_param.input_right_pads_, input_right_pads_i32,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial, ...@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AComputeType = InDataType, typename AComputeType = InDataType,
typename BComputeType = AComputeType> typename BComputeType = AComputeType,
typename IndexType = ck::index_t>
bool profile_grouped_conv_fwd_impl(int do_verification, bool profile_grouped_conv_fwd_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
const auto out_g_n_k_wos_desc = const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param); ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{}; std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{}; std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{}; std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{}; std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{}; std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{}; std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<IndexType, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<IndexType, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<IndexType, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{}; std::array<IndexType, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
......
...@@ -82,6 +82,11 @@ set(PROFILER_EXECUTABLE ckProfiler) ...@@ -82,6 +82,11 @@ set(PROFILER_EXECUTABLE ckProfiler)
add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
# flags to compress the library
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
message("Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
......
This diff is collapsed.
This diff is collapsed.
...@@ -17,6 +17,7 @@ class TestGroupedConvndFwd : public ::testing::Test ...@@ -17,6 +17,7 @@ class TestGroupedConvndFwd : public ::testing::Test
using InLayout = std::tuple_element_t<1, Tuple>; using InLayout = std::tuple_element_t<1, Tuple>;
using WeiLayout = std::tuple_element_t<2, Tuple>; using WeiLayout = std::tuple_element_t<2, Tuple>;
using OutLayout = std::tuple_element_t<3, Tuple>; using OutLayout = std::tuple_element_t<3, Tuple>;
using IndexType = std::tuple_element_t<4, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params; std::vector<ck::utils::conv::ConvParam> conv_params;
...@@ -33,7 +34,10 @@ class TestGroupedConvndFwd : public ::testing::Test ...@@ -33,7 +34,10 @@ class TestGroupedConvndFwd : public ::testing::Test
OutLayout, OutLayout,
DataType, DataType,
DataType, DataType,
DataType>( DataType,
DataType,
DataType,
IndexType>(
true, // do_verification true, // do_verification
1, // init_method: integer value 1, // init_method: integer value
false, // do_log false, // do_log
...@@ -46,30 +50,31 @@ class TestGroupedConvndFwd : public ::testing::Test ...@@ -46,30 +50,31 @@ class TestGroupedConvndFwd : public ::testing::Test
using namespace ck::tensor_layout::convolution; using namespace ck::tensor_layout::convolution;
using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>, using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK, ck::index_t>,
std::tuple<ck::half_t, GNWC, GKXC, GNWK>, std::tuple<ck::half_t, GNWC, GKXC, GNWK, ck::index_t>,
std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK>, std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK, ck::index_t>,
std::tuple<int8_t, GNWC, GKXC, GNWK>>; std::tuple<int8_t, GNWC, GKXC, GNWK, ck::index_t>>;
using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>, using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK, ck::index_t>,
std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>, std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>, std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
std::tuple<int8_t, GNHWC, GKYXC, GNHWK>, std::tuple<int8_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
std::tuple<float, NHWGC, GKYXC, NHWGK>, std::tuple<float, NHWGC, GKYXC, NHWGK, ck::index_t>,
std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>, std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>, std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>; std::tuple<int8_t, NHWGC, GKYXC, NHWGK, ck::index_t>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>, using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>, std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK>, std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK>, std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
std::tuple<float, NDHWGC, GKZYXC, NDHWGK>, std::tuple<float, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>, std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>, std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>; std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>>;
using KernelTypes2dLargeCases = ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>>; using KernelTypes2dLargeCases =
::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK, ck::long_index_t>>;
template <typename Tuple> template <typename Tuple>
class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple> class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>
...@@ -153,5 +158,8 @@ TYPED_TEST(TestGroupedConvndFwd2dLargeCases, Test2DLargeCases) ...@@ -153,5 +158,8 @@ TYPED_TEST(TestGroupedConvndFwd2dLargeCases, Test2DLargeCases)
// With supported NumGroupsToMerge > 1 // With supported NumGroupsToMerge > 1
this->conv_params.push_back( this->conv_params.push_back(
{2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}}); {2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}});
// When image is larger than 2GB
this->conv_params.push_back(
{2, 1, 1, 256, 256, {3, 3}, {4096, 2048}, {1024, 1024}, {3, 3}, {1, 1}, {1, 1}});
this->template Run<2>(); this->template Run<2>();
} }
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment