Merge branch 'develop' into enable_navi_32_ci

9d3fb0b5 · Ted Themistokleous · GitHub · 9c91c08d · aeb9f78c · 9d3fb0b5
Unverified Commit 9d3fb0b5 authored Aug 05, 2023 by Ted Themistokleous Committed by GitHub Aug 05, 2023
20 changed files
--- a/src/targets/cpu/include/migraphx/cpu/lowering.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/lowering.hpp
@@ -24,8 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP

-#include <string>
-#include <migraphx/config.hpp>
+#include <migraphx/cpu/context.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -34,7 +33,7 @@ struct module;

 namespace cpu {

-struct lowering
+struct MIGRAPHX_CPU_EXPORT lowering
 {
    std::string name() const { return "cpu::lowering"; }
    void apply(module& m) const;

--- a/src/targets/cpu/include/migraphx/cpu/target.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/target.hpp
@@ -28,14 +28,13 @@
 #include <migraphx/register_target.hpp>
 #include <migraphx/compile_options.hpp>
 #include <migraphx/cpu/context.hpp>
-#include <migraphx/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 struct pass;
 namespace cpu {

-struct target
+struct MIGRAPHX_CPU_EXPORT target
 {
    std::string name() const;
    std::vector<pass> get_passes(migraphx::context& gctx, const compile_options&) const;

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -27,7 +27,7 @@
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
 #include <migraphx/op/convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/quant_dot.hpp>
@@ -345,7 +345,7 @@ struct cpu_apply
        extend_op("contiguous", "dnnl::reorder");
        extend_op("convolution", "dnnl::convolution");
 #ifndef MIGRAPHX_ENABLE_ZENDNN
-        extend_op("deconvolution", "dnnl::deconvolution");
+        extend_op("convolution_backwards", "dnnl::convolution_backwards");
        extend_op("dot", "dnnl::dot");
 #endif
        extend_op("erf", "cpu::erf");

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -33,7 +33,10 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
 endif()

-find_package(composable_kernel 1.0.0 COMPONENTS jit_library REQUIRED) 
+if(NOT WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
+endif()

 if(BUILD_DEV)
    set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
@@ -42,12 +45,12 @@ else()
 endif()

 include(Embed)
-file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
+file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-add_embed_library(migraphx_kernels ${KERNEL_FILES})
+add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)

-file(GLOB DEVICE_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
+file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
 add_library(migraphx_device ${DEVICE_GPU_SRCS})

 add_library(compile_for_gpu INTERFACE)
@@ -67,6 +70,8 @@ target_link_libraries(migraphx_device PUBLIC migraphx)
 target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
 target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
+target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
+migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)

 add_library(kernel_file_check EXCLUDE_FROM_ALL)

@@ -82,7 +87,13 @@ target_link_libraries(kernel_file_check compile_for_gpu)

 rocm_clang_tidy_check(kernel_file_check)

-file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
+file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
+
+if(WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
+endif()
+
 add_library(migraphx_gpu
    abs.cpp
    analyze_streams.cpp
@@ -131,7 +142,9 @@ add_library(migraphx_gpu
    write_literals.cpp
    ${JIT_GPU_SRCS}
 )
+
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
+migraphx_generate_export_header(migraphx_gpu)

 function(register_migraphx_gpu_ops PREFIX)
    foreach(OP ${ARGN})
@@ -173,7 +186,7 @@ register_op(migraphx_gpu
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
    INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
-    OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
+    OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::convolution_backwards> gpu::miopen_convolution<op::quant_convolution>
    INCLUDES migraphx/gpu/context.hpp)
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)
@@ -185,7 +198,9 @@ if(MIGRAPHX_ENABLE_MLIR)
    find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
    message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}")
    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
-    target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler)
+    # Make this private to avoid multiple inclusions of LLVM symbols.
+    # TODO: Fix rocMLIR's library to hide LLVM internals.
+    target_link_libraries(migraphx_gpu PRIVATE rocMLIR::rockCompiler)
 endif()

 if(MIGRAPHX_USE_HIPRTC)
@@ -231,7 +246,12 @@ check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_
 set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")

 if(MIGRAPHX_USE_FIND_2_API)
-    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    check_library_exists(MIOpen "miopenSetFindOptionPreallocatedTensor" "${MIOPEN_LOCATION}" HAS_PREALLOCATION_API)
+    if(HAS_PREALLOCATION_API)
+        target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API -DMIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS)
+    else()
+        target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    endif()
    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
 else()
    message(STATUS "MIGraphx is using legacy Find API in MIOpen")
@@ -245,7 +265,11 @@ else()
 endif()

 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
-target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels composable_kernel::jit_library)
+target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
+if(NOT WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
+endif()

 add_subdirectory(driver)
 add_subdirectory(hiprtc)

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -135,14 +135,13 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    std::size_t max_global = ctx.get_current_device().get_cu_count() *
                             ctx.get_current_device().get_max_workitems_per_cu();
    return [n, over, max_global](std::size_t local) {
-        std::size_t num_elements = n;
+        // HIP requires global workitems to be a multiple of local workitems (may hurt performance).
+        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
+        // https://reviews.llvm.org/D155213
+        std::size_t num_elements = ((n + local - 1) / local) * local;
        std::size_t groups       = (num_elements + local - 1) / local;
        std::size_t max_blocks   = max_global / local;
        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
-#ifdef MIGRAPHX_USE_HIPRTC
-        if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
-            num_elements = ((num_elements + local - 1) / local) * local;
-#endif
        return std::min(nglobal, num_elements);
    };
 }
@@ -168,7 +167,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
                   [](auto&& p) {
                       auto&& name = p.first;
                       auto&& c    = p.second;
-                       auto path   = fs::path{"migraphx"} / "kernels" / name;
+                       auto path   = name;
                       return src_file{path, c};
                   });
    srcs.push_back(src_file{fs::path{"main.cpp"},

--- a/src/targets/gpu/compile_miopen.cpp
+++ b/src/targets/gpu/compile_miopen.cpp
@@ -79,7 +79,7 @@ void compile_miopen::apply(module& m) const
        std::size_t ws = 0;
        try
        {
-            // for the regular convolution and deconvolution, this try would always succeed
+            // for the regular convolution and convolution_backwards, this try would always succeed
            ws = compile(op, ins, int8_x4_format);
        }
        catch(migraphx::exception&)

--- a/src/targets/gpu/driver/CMakeLists.txt
+++ b/src/targets/gpu/driver/CMakeLists.txt
@@ -22,7 +22,7 @@
 # THE SOFTWARE.
 #####################################################################################

-file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB GPU_DRIVER_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 add_executable(gpu-driver
    ${GPU_DRIVER_SRCS}
 )

--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -216,6 +216,7 @@ struct find_mlir_op
                                                                "quant_dot",
                                                                "add",
                                                                "clip",
+                                                                "relu",
                                                                "sub",
                                                                "mul",
                                                                "div",

--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
@@ -140,8 +140,11 @@ void gemm_impl(context& ctx,
            compute_type = rocblas_datatype_f32_r;
    }

-    rocblas_gemm_flags flag =
-        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
+    rocblas_gemm_flags flag = rocblas_gemm_flags_none;
+#if ROCBLAS_VERSION_MAJOR < 3
+    if(int8_x4_format)
+        flag = rocblas_gemm_flags_pack_int8x4;
+#endif

    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();

--- a/src/targets/gpu/include/migraphx/gpu/allocation_model.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/allocation_model.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <string>
@@ -33,7 +33,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct gpu_allocation_model
+struct MIGRAPHX_GPU_EXPORT gpu_allocation_model
 {
    std::string name() const;
    std::string copy() const;

--- a/src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/analyze_streams.hpp>

 namespace migraphx {
@@ -34,7 +34,7 @@ struct module;

 namespace gpu {

-std::vector<stream_race> analyze_streams(const module& m);
+MIGRAPHX_GPU_EXPORT std::vector<stream_race> analyze_streams(const module& m);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP
 #define MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/filesystem.hpp>
 #include <migraphx/compile_src.hpp>
 #include <migraphx/env.hpp>
@@ -58,14 +58,13 @@ struct hiprtc_src_file
    }
 };

-std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
-                                                           std::string params,
-                                                           const std::string& arch);
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>> compile_hip_src_with_hiprtc(
+    std::vector<hiprtc_src_file> srcs, std::string params, const std::string& arch);

-std::vector<std::vector<char>>
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>>
 compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);

-std::string enum_params(std::size_t count, std::string param);
+MIGRAPHX_GPU_EXPORT std::string enum_params(std::size_t count, std::string param);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/compile_src.hpp>

@@ -66,14 +66,16 @@ struct hip_compile_options
 };

 /// Compute global for n elements, but max out on target-specific upper limit
-std::function<std::size_t(std::size_t local)>
+MIGRAPHX_GPU_EXPORT std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);

-operation compile_hip_code_object(const std::string& content, hip_compile_options options);
+MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(const std::string& content,
+                                                      hip_compile_options options);

-std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
+MIGRAPHX_GPU_EXPORT std::size_t compute_block_size(std::size_t n,
+                                                   std::size_t max_block_size = 1024);

-std::string generate_make_shape(const shape& s);
+MIGRAPHX_GPU_EXPORT std::string generate_make_shape(const shape& s);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_ops.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>

 namespace migraphx {
@@ -36,7 +36,7 @@ namespace gpu {

 struct context;

-struct compile_ops
+struct MIGRAPHX_GPU_EXPORT compile_ops
 {
    context* ctx         = nullptr;
    bool exhaustive_tune = false;

--- a/src/targets/gpu/include/migraphx/gpu/compiler.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compiler.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILER_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/auto_register.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/value.hpp>
@@ -32,6 +32,7 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/optional.hpp>
 #include <migraphx/rank.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 #include <functional>

 namespace migraphx {
@@ -68,12 +69,6 @@ struct compiler_replace
    }
 };

-struct tuning_config
-{
-    value problem;
-    std::vector<value> solutions;
-};
-
 using compiler_compile =
    std::function<compiler_replace(context&, instruction_ref, operation, const value&)>;
 using compiler_compile_op =
@@ -81,17 +76,21 @@ using compiler_compile_op =
 using compiler_tuning_config =
    std::function<optional<tuning_config>(context&, instruction_ref, const operation&, bool)>;

-void register_compiler(const std::string& name,
-                       compiler_compile c,
-                       compiler_compile_op cop,
-                       compiler_tuning_config ctg);
-
-bool has_compiler_for(const std::string& name);
-compiler_replace
-compile(context& ctx, instruction_ref ins, const operation& op, const value& solution);
-operation
-compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
-optional<tuning_config>
+MIGRAPHX_GPU_EXPORT void register_compiler(const std::string& name,
+                                           compiler_compile c,
+                                           compiler_compile_op cop,
+                                           compiler_tuning_config ctg);
+
+MIGRAPHX_GPU_EXPORT bool has_compiler_for(const std::string& name);
+MIGRAPHX_GPU_EXPORT compiler_replace compile(context& ctx,
+                                             instruction_ref ins,
+                                             const operation& op,
+                                             const value& solution);
+MIGRAPHX_GPU_EXPORT operation compile_op(const std::string& name,
+                                         context& ctx,
+                                         const std::vector<shape>& inputs,
+                                         const value& v);
+MIGRAPHX_GPU_EXPORT optional<tuning_config>
 get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive);

 template <class T>

--- a/src/targets/gpu/include/migraphx/gpu/config.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/config.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef MIGRAPHX_GUARD_GPU_CONFIG_HPP
+#define MIGRAPHX_GUARD_GPU_CONFIG_HPP
+
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/export.h>
+
+#endif // MIGRAPHX_GUARD_GPU_CONFIG_HPP
--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP

+#include <migraphx/gpu/export.h>
 #include <migraphx/context.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>

--- a/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
@@ -41,8 +41,6 @@ struct miopen_contiguous : unary_device<miopen_contiguous, &device::contiguous>
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(2);
-        if(inputs.front().standard())
-            return inputs.front();
        auto lens = inputs.at(0).lens();
        auto t    = inputs.at(0).type();
        return {t, lens};

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -31,7 +31,7 @@
 #include <migraphx/op/identity.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <unordered_map>
 #include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -146,7 +146,8 @@ struct miopen_convolution

    void set_conv_descriptor()
    {
-        cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
+        cd =
+            (op.name() == "convolution_backwards") ? make_convolution_backwards(op) : make_conv(op);
    }

    value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
@@ -159,10 +160,31 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
-        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        std::size_t workspace_size = 0;
+        auto status                = miopenConvolutionForwardGetWorkSpaceSize(miopen_stream_handle,
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
+
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
+
 #ifdef MIGRAPHX_HAS_FIND_2_API
        {
            auto conv_problem = make_obj<miopen_problem>(
@@ -170,13 +192,34 @@ struct miopen_convolution

            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            bool preallocate = false;
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
+            preallocate = true;
+#endif
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
+            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto workspace =
+                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
+
            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);

-            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, x.implicit()},
+                {miopenTensorConvolutionW, nullptr, w.implicit()},
+                {miopenTensorConvolutionY, nullptr, y.implicit()},
+            };
+
+            solution_ptr = find_solution(miopen_stream_handle,
+                                         3,
+                                         tensor_args,
+                                         workspace.implicit(),
+                                         workspace_size,
+                                         conv_problem.get(),
+                                         ctx.get_exhaustive_tune_flag());

-            solution_ptr = find_solution(
-                miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
-            auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
            if(status != miopenStatusSuccess)
                MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");

@@ -195,29 +238,10 @@ struct miopen_convolution
            return shape{shape::int8_type, {workspace_size}};
        }
 #else
-        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                                               w_desc.get(),
-                                                               x_desc.get(),
-                                                               cd.get(),
-                                                               y_desc.get(),
-                                                               &workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
-
-        workspace_shape = shape{shape::int8_type, {workspace_size}};
-
-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
        auto x         = to_gpu(generate_argument(x_shape));
        auto w         = to_gpu(generate_argument(w_shape));
        auto y         = allocate_gpu(output_shape);
        auto workspace = allocate_gpu(workspace_shape);
-
        int algo_count = 1;
        miopenConvAlgoPerf_t perf;
        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
@@ -337,6 +361,7 @@ struct miopen_convolution
        return {s.type(), lens, strides};
    }
 };
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP

 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>

 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT argmax(hipStream_t stream,
+                                   const argument& result,
+                                   const argument& arg,
+                                   int64_t axis);

 } // namespace device
 } // namespace gpu