Merge branch 'develop' into threaded_nms

40fbef9b · Ted Themistokleous · GitHub · d164b151 · aeb9f78c · 40fbef9b
Unverified commit 40fbef9b, authored Aug 05, 2023 by Ted Themistokleous; committed by GitHub on Aug 05, 2023
20 changed files
--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -146,7 +146,11 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
    gpu_sync();
    std::vector<T> result(sz);
    assert(not is_device_ptr(result.data()));
-    assert(is_device_ptr(x));
+    if(not is_device_ptr(x))
+    {
+        MIGRAPHX_THROW(
+            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
+    }
    auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT

--- a/src/targets/gpu/include/migraphx/gpu/allocation_model.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/allocation_model.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <string>
@@ -33,7 +33,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-struct gpu_allocation_model
+struct MIGRAPHX_GPU_EXPORT gpu_allocation_model
 {
    std::string name() const;
    std::string copy() const;

--- a/src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/analyze_streams.hpp>
 namespace migraphx {
@@ -34,7 +34,7 @@ struct module;
 namespace gpu {
-std::vector<stream_race> analyze_streams(const module& m);
+MIGRAPHX_GPU_EXPORT std::vector<stream_race> analyze_streams(const module& m);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
@@ -24,9 +24,10 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP
 #define MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/filesystem.hpp>
 #include <migraphx/compile_src.hpp>
+#include <migraphx/env.hpp>
 #include <migraphx/functional.hpp>
 #include <string>
 #include <utility>
@@ -36,6 +37,11 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+#ifdef MIGRAPHX_USE_HIPRTC
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
+#endif
 struct hiprtc_src_file
 {
    hiprtc_src_file() = default;
@@ -52,14 +58,13 @@ struct hiprtc_src_file
    }
 };
-std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>> compile_hip_src_with_hiprtc(
-                                                           std::string params,
+    std::vector<hiprtc_src_file> srcs, std::string params, const std::string& arch);
-                                                           const std::string& arch);
-std::vector<std::vector<char>>
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>>
 compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
-std::string enum_params(std::size_t count, std::string param);
+MIGRAPHX_GPU_EXPORT std::string enum_params(std::size_t count, std::string param);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -24,8 +24,9 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
+#include <migraphx/compile_src.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -39,9 +40,10 @@ struct hip_compile_options
    std::size_t local;
    std::vector<shape> inputs;
    shape output;
-    std::string kernel_name           = "kernel";
+    std::string kernel_name                    = "kernel";
-    std::string params                = "";
+    std::string params                         = "";
-    std::vector<shape> virtual_inputs = {};
+    std::vector<shape> virtual_inputs          = {};
+    std::vector<src_file> additional_src_files = {};
    /**
     * @brief Set the launch parameters but allow v to override the values
@@ -64,14 +66,16 @@ struct hip_compile_options
 };
 /// Compute global for n elements, but max out on target-specific upper limit
-std::function<std::size_t(std::size_t local)>
+MIGRAPHX_GPU_EXPORT std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
-operation compile_hip_code_object(const std::string& content, hip_compile_options options);
+MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(const std::string& content,
+                                                      hip_compile_options options);
-std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
+MIGRAPHX_GPU_EXPORT std::size_t compute_block_size(std::size_t n,
+                                                   std::size_t max_block_size = 1024);
-std::string generate_make_shape(const shape& s);
+MIGRAPHX_GPU_EXPORT std::string generate_make_shape(const shape& s);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/compile_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_ops.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>
 namespace migraphx {
@@ -36,9 +36,10 @@ namespace gpu {
 struct context;
-struct compile_ops
+struct MIGRAPHX_GPU_EXPORT compile_ops
 {
-    context* ctx = nullptr;
+    context* ctx         = nullptr;
+    bool exhaustive_tune = false;
    std::string name() const { return "gpu::compile_ops"; }
    void apply(module& m) const;
 };

--- a/src/targets/gpu/include/migraphx/gpu/compiler.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compiler.hpp
@@ -24,12 +24,15 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILER_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/auto_register.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/module.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/optional.hpp>
+#include <migraphx/rank.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 #include <functional>
 namespace migraphx {
@@ -38,17 +41,57 @@ namespace gpu {
 struct context;
-using compiler_replace = std::function<void(module& m, instruction_ref ins)>;
+struct compiler_replace
-using compiler_compile = std::function<compiler_replace(context&, instruction_ref, operation)>;
+{
+    compiler_replace() = default;
+    compiler_replace(const operation& op) : code_object{op} {}
+    template <class F>
+    compiler_replace(const operation& op, F f)
+        : code_object{op},
+          replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) {
+              f(m, ins, cr.code_object);
+          })
+    {
+    }
+    operation code_object = {};
+    std::function<void(const compiler_replace& cr, module& m, instruction_ref ins)> replace_fn =
+        nullptr;
+    void replace(module& m, instruction_ref ins) const
+    {
+        if(replace_fn)
+            replace_fn(*this, m, ins);
+        else
+            m.replace_instruction(ins, code_object, ins->inputs());
+    }
+};
+using compiler_compile =
+    std::function<compiler_replace(context&, instruction_ref, operation, const value&)>;
 using compiler_compile_op =
    std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
+using compiler_tuning_config =
+    std::function<optional<tuning_config>(context&, instruction_ref, const operation&, bool)>;
-void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop);
+MIGRAPHX_GPU_EXPORT void register_compiler(const std::string& name,
+                                           compiler_compile c,
+                                           compiler_compile_op cop,
+                                           compiler_tuning_config ctg);
-bool has_compiler_for(const std::string& name);
+MIGRAPHX_GPU_EXPORT bool has_compiler_for(const std::string& name);
-compiler_replace compile(context& ctx, instruction_ref ins, const operation& op);
+MIGRAPHX_GPU_EXPORT compiler_replace compile(context& ctx,
-operation
+                                             instruction_ref ins,
-compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
+                                             const operation& op,
+                                             const value& solution);
+MIGRAPHX_GPU_EXPORT operation compile_op(const std::string& name,
+                                         context& ctx,
+                                         const std::vector<shape>& inputs,
+                                         const value& v);
+MIGRAPHX_GPU_EXPORT optional<tuning_config>
+get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive);
 template <class T>
 void register_compiler()
@@ -58,8 +101,11 @@ void register_compiler()
    {
        register_compiler(
            name,
-            [=](auto&&... xs) { return c.compile(std::forward<decltype(xs)>(xs)...); },
+            [=](auto&&... xs) {
-            [=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); });
+                return c.invoke_compile(rank<1>{}, std::forward<decltype(xs)>(xs)...);
+            },
+            [=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); },
+            [=](auto&&... xs) { return c.get_tuning_config(std::forward<decltype(xs)>(xs)...); });
    }
 }
@@ -78,12 +124,31 @@ using auto_register_compiler = auto_register<register_compiler_action, T>;
 template <class Derived>
 struct compiler : auto_register_compiler<Derived>
 {
-    auto replace(const operation& op) const
+    const Derived& derived() const { return static_cast<const Derived&>(*this); }
+    optional<tuning_config>
+    get_tuning_config(context&, instruction_ref, const operation&, bool) const
    {
-        return
+        return nullopt;
-            [=](module& m, instruction_ref ins) { m.replace_instruction(ins, op, ins->inputs()); };
    }
    operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
+    template <class D = Derived>
+    auto invoke_compile(
+        rank<1>, context& ctx, instruction_ref ins, operation op, const value& solution) const
+        -> decltype(std::declval<D>().compile(ctx, ins, std::move(op), solution))
+    {
+        return derived().compile(ctx, ins, std::move(op), solution);
+    }
+    template <class D = Derived>
+    auto invoke_compile(
+        rank<0>, context& ctx, instruction_ref ins, operation op, const value& solution) const
+        -> decltype(std::declval<D>().compile(ctx, ins, std::move(op)))
+    {
+        assert(solution.empty());
+        (void)solution;
+        return derived().compile(ctx, ins, std::move(op));
+    }
 };
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/config.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/config.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_CONFIG_HPP
+#define MIGRAPHX_GUARD_GPU_CONFIG_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/export.h>
+#endif // MIGRAPHX_GUARD_GPU_CONFIG_HPP
--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
+#include <migraphx/gpu/export.h>
 #include <migraphx/context.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>
@@ -170,7 +171,9 @@ struct hip_device
    std::size_t stream_id() const { return current_stream; }
-    std::string get_device_name() const { return device_props.gcnArchName; }
+    std::string get_device_name() const { return get_arch_name(device_props); }
+    std::string get_gfx_name() const { return trim(split_string(get_device_name(), ':').front()); }
    std::size_t get_device_major() const { return device_props.major; }

--- a/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
@@ -41,8 +41,6 @@ struct miopen_contiguous : unary_device<miopen_contiguous, &device::contiguous>
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(2);
-        if(inputs.front().standard())
-            return inputs.front();
        auto lens = inputs.at(0).lens();
        auto t    = inputs.at(0).type();
        return {t, lens};

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -31,7 +31,7 @@
 #include <migraphx/op/identity.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <unordered_map>
 #include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -146,7 +146,8 @@ struct miopen_convolution
    void set_conv_descriptor()
    {
-        cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
+        cd =
+            (op.name() == "convolution_backwards") ? make_convolution_backwards(op) : make_conv(op);
    }
    value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
@@ -159,10 +160,31 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
-        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        std::size_t workspace_size = 0;
+        auto status                = miopenConvolutionForwardGetWorkSpaceSize(miopen_stream_handle,
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
 #ifdef MIGRAPHX_HAS_FIND_2_API
        {
            auto conv_problem = make_obj<miopen_problem>(
@@ -170,13 +192,34 @@ struct miopen_convolution
            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            bool preallocate = false;
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
+            preallocate = true;
+#endif
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
+            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto workspace =
+                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
-            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, x.implicit()},
+                {miopenTensorConvolutionW, nullptr, w.implicit()},
+                {miopenTensorConvolutionY, nullptr, y.implicit()},
+            };
+            solution_ptr = find_solution(miopen_stream_handle,
+                                         3,
+                                         tensor_args,
+                                         workspace.implicit(),
+                                         workspace_size,
+                                         conv_problem.get(),
+                                         ctx.get_exhaustive_tune_flag());
-            solution_ptr = find_solution(
+            status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
-                miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
-            auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
            if(status != miopenStatusSuccess)
                MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");
@@ -195,29 +238,10 @@ struct miopen_convolution
            return shape{shape::int8_type, {workspace_size}};
        }
 #else
-        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                                               w_desc.get(),
-                                                               x_desc.get(),
-                                                               cd.get(),
-                                                               y_desc.get(),
-                                                               &workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
-        workspace_shape = shape{shape::int8_type, {workspace_size}};
-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
        auto x         = to_gpu(generate_argument(x_shape));
        auto w         = to_gpu(generate_argument(w_shape));
        auto y         = allocate_gpu(output_shape);
        auto workspace = allocate_gpu(workspace_shape);
        int algo_count = 1;
        miopenConvAlgoPerf_t perf;
        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
@@ -337,6 +361,7 @@ struct miopen_convolution
        return {s.type(), lens, strides};
    }
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT argmax(hipStream_t stream,
+                                   const argument& result,
+                                   const argument& arg,
+                                   int64_t axis);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT argmin(hipStream_t stream,
+                                   const argument& result,
+                                   const argument& arg,
+                                   int64_t axis);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/config.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/config.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/export.h>
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/contiguous.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/contiguous.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_KERNELS_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,9 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void contiguous(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT contiguous(hipStream_t stream,
+                                       const argument& result,
+                                       const argument& arg);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/fill.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/fill.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void fill(hipStream_t stream, const argument& result, unsigned long val);
+void MIGRAPHX_DEVICE_EXPORT fill(hipStream_t stream, const argument& result, unsigned long val);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis);
+argument MIGRAPHX_DEVICE_EXPORT
+gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,9 +33,13 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_a(hipStream_t stream,
+                                             const argument& result,
+                                             const argument& arg);
-void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_b(hipStream_t stream,
+                                             const argument& result,
+                                             const argument& arg);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT logsoftmax(hipStream_t stream,
+                                       const argument& result,
+                                       const argument& arg,
+                                       int64_t axis);
 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/multinomial.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/multinomial.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,10 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void multinomial(hipStream_t stream,
+void MIGRAPHX_DEVICE_EXPORT multinomial(hipStream_t stream,
-                 const argument& result,
+                                        const argument& result,
-                 const argument& arg0,
+                                        const argument& arg0,
-                 const argument& arg1);
+                                        const argument& arg1);
 } // namespace device
 } // namespace gpu