LWPCK-2429: Use hipMemcpyAsync for device grouped GEMM kernel-argument copies

Resolve merge conflicts

LWPCK-2429: Use hipMemcpyAsync for device grouped GEMM kernel-argument copies
Resolve merge conflicts
2183406b · rtmadduri · e7b62864 · 2183406b · 2183406b · 2183406b
Commit 2183406b authored Nov 27, 2024 by rtmadduri
7 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -603,7 +603,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
            }

            hipGetErrorString(
-                hipMemcpyWithStream(arg.p_workspace_,
+                hipMemcpyAsync(arg.p_workspace_,
                               arg.gemm_desc_kernel_arg_.data(),
                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
                               hipMemcpyHostToDevice,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -761,7 +761,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
            float time{0.f};

            hip_check_error(
-                hipMemcpyWithStream(dev_gemm_kargs,
+                hipMemcpyAsync(dev_gemm_kargs,
                               arg.gemm_kernel_args_.data(),
                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
                               hipMemcpyHostToDevice,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -940,7 +940,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                             const void* p_host_kernel_args) const
    {
        arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
+        hip_check_error(hipMemcpyAsync(p_dev_kernel_args,
                                       p_host_kernel_args,
                                       GetDeviceKernelArgSize(&arg),
                                       hipMemcpyHostToDevice));

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -557,10 +557,10 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                }
            }

-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+            hipGetErrorString(
+                hipMemcpyAsync(arg.p_workspace_,
                               arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() *
-                                                      sizeof(GemmBiasTransKernelArg),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
                               hipMemcpyHostToDevice,
                               stream_config.stream_id_));


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -421,7 +421,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
            }

            hip_check_error(
-                hipMemcpyWithStream(arg.p_workspace_,
+                hipMemcpyAsync(arg.p_workspace_,
                               arg.gemm_kernel_args_.data(),
                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
                               hipMemcpyHostToDevice,

--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -302,6 +302,13 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                                  rtol,
                                                                  atol);

+                        instance_pass =
+                            instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                  c_m_n_host_results[i],
+                                                                  "Error: Incorrect results!",
+                                                                  rtol,
+                                                                  atol);
+
                        if(do_log)
                        {
                            LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")