Unverified Commit 23cb7917 authored by Brian Pickrell, committed by GitHub

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>
 namespace migraphx {
@@ -36,9 +36,10 @@ namespace gpu {
 struct context;
-struct compile_ops
+struct MIGRAPHX_GPU_EXPORT compile_ops
 {
     context* ctx = nullptr;
+    bool exhaustive_tune = false;
     std::string name() const { return "gpu::compile_ops"; }
     void apply(module& m) const;
 };
...
@@ -24,12 +24,15 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILER_HPP
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/auto_register.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/module.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/optional.hpp>
+#include <migraphx/rank.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 #include <functional>
 namespace migraphx {
@@ -38,17 +41,57 @@ namespace gpu {
 struct context;
-using compiler_replace = std::function<void(module& m, instruction_ref ins)>;
-using compiler_compile = std::function<compiler_replace(context&, instruction_ref, operation)>;
+struct compiler_replace
+{
+    compiler_replace() = default;
+    compiler_replace(const operation& op) : code_object{op} {}
+
+    template <class F>
+    compiler_replace(const operation& op, F f)
+        : code_object{op},
+          replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) {
+              f(m, ins, cr.code_object);
+          })
+    {
+    }
+
+    operation code_object = {};
+    std::function<void(const compiler_replace& cr, module& m, instruction_ref ins)> replace_fn =
+        nullptr;
+
+    void replace(module& m, instruction_ref ins) const
+    {
+        if(replace_fn)
+            replace_fn(*this, m, ins);
+        else
+            m.replace_instruction(ins, code_object, ins->inputs());
+    }
+};
+
+using compiler_compile =
+    std::function<compiler_replace(context&, instruction_ref, operation, const value&)>;
 using compiler_compile_op =
     std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
+using compiler_tuning_config =
+    std::function<optional<tuning_config>(context&, instruction_ref, const operation&, bool)>;
-void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop);
+MIGRAPHX_GPU_EXPORT void register_compiler(const std::string& name,
+                                           compiler_compile c,
+                                           compiler_compile_op cop,
+                                           compiler_tuning_config ctg);
-bool has_compiler_for(const std::string& name);
+MIGRAPHX_GPU_EXPORT bool has_compiler_for(const std::string& name);
-compiler_replace compile(context& ctx, instruction_ref ins, const operation& op);
-operation
-compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
+MIGRAPHX_GPU_EXPORT compiler_replace compile(context& ctx,
+                                             instruction_ref ins,
+                                             const operation& op,
+                                             const value& solution);
+
+MIGRAPHX_GPU_EXPORT operation compile_op(const std::string& name,
+                                         context& ctx,
+                                         const std::vector<shape>& inputs,
+                                         const value& v);
+
+MIGRAPHX_GPU_EXPORT optional<tuning_config>
+get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive);
 template <class T>
 void register_compiler()
@@ -58,8 +101,11 @@ void register_compiler()
     {
         register_compiler(
            name,
-            [=](auto&&... xs) { return c.compile(std::forward<decltype(xs)>(xs)...); },
-            [=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); });
+            [=](auto&&... xs) {
+                return c.invoke_compile(rank<1>{}, std::forward<decltype(xs)>(xs)...);
+            },
+            [=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); },
+            [=](auto&&... xs) { return c.get_tuning_config(std::forward<decltype(xs)>(xs)...); });
     }
 }
@@ -78,12 +124,31 @@ using auto_register_compiler = auto_register<register_compiler_action, T>;
 template <class Derived>
 struct compiler : auto_register_compiler<Derived>
 {
-    auto replace(const operation& op) const
+    const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+    optional<tuning_config>
+    get_tuning_config(context&, instruction_ref, const operation&, bool) const
     {
-        return
-            [=](module& m, instruction_ref ins) { m.replace_instruction(ins, op, ins->inputs()); };
+        return nullopt;
     }
     operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
+
+    template <class D = Derived>
+    auto invoke_compile(
+        rank<1>, context& ctx, instruction_ref ins, operation op, const value& solution) const
+        -> decltype(std::declval<D>().compile(ctx, ins, std::move(op), solution))
+    {
+        return derived().compile(ctx, ins, std::move(op), solution);
+    }
+
+    template <class D = Derived>
+    auto invoke_compile(
+        rank<0>, context& ctx, instruction_ref ins, operation op, const value& solution) const
+        -> decltype(std::declval<D>().compile(ctx, ins, std::move(op)))
+    {
+        assert(solution.empty());
+        (void)solution;
+        return derived().compile(ctx, ins, std::move(op));
+    }
 };
 } // namespace gpu
...
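The invoke_compile overloads above use rank-based tag dispatch: register_compiler passes rank<1>{}, so the overload whose trailing decltype finds a four-argument compile(ctx, ins, op, solution) on the derived compiler is preferred, while compilers that still declare the three-argument compile drop that overload via SFINAE and fall back to the rank<0> version. Below is a minimal, standalone sketch of the same idiom; toy_rank, new_style, and old_style are illustrative stand-ins, not MIGraphX code.

// Standalone sketch of rank-based overload selection (hypothetical names).
#include <cassert>
#include <iostream>
#include <string>

template <int N>
struct toy_rank : toy_rank<N - 1>
{
};
template <>
struct toy_rank<0>
{
};

// A "new style" compiler whose compile() accepts a tuning solution.
struct new_style
{
    std::string compile(const std::string& op, const std::string& solution) const
    {
        return op + " tuned with " + solution;
    }
};

// An "old style" compiler that predates tuning solutions.
struct old_style
{
    std::string compile(const std::string& op) const { return op + " (no tuning)"; }
};

// Prefer the two-argument compile() when it exists (toy_rank<1> overload wins),
// otherwise SFINAE removes this overload and the toy_rank<0> one is used.
template <class C>
auto invoke(toy_rank<1>, const C& c, const std::string& op, const std::string& solution)
    -> decltype(c.compile(op, solution))
{
    return c.compile(op, solution);
}

template <class C>
auto invoke(toy_rank<0>, const C& c, const std::string& op, const std::string& solution)
    -> decltype(c.compile(op))
{
    assert(solution.empty()); // old-style compilers cannot consume a solution
    return c.compile(op);
}

int main()
{
    std::cout << invoke(toy_rank<1>{}, new_style{}, "gemm", "solution-7") << "\n";
    std::cout << invoke(toy_rank<1>{}, old_style{}, "gemm", "") << "\n"; // falls back to rank<0>
}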
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_CONFIG_HPP
#define MIGRAPHX_GUARD_GPU_CONFIG_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/export.h>
#endif // MIGRAPHX_GUARD_GPU_CONFIG_HPP
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
+#include <migraphx/gpu/export.h>
 #include <migraphx/context.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>
@@ -45,13 +46,7 @@ using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);
 struct hip_device
 {
-    hip_device()
-    {
-        device_props.gcnArchName[0] = '\0';
-        device_props.gcnArch = 0;
-        device_props.multiProcessorCount = 0;
-        add_stream();
-    }
+    hip_device() : device_props{} { add_stream(); }
 
     hip_device(std::size_t id, std::size_t n) : device_id(id)
     {
@@ -172,6 +167,8 @@ struct hip_device
     std::string get_device_name() const { return device_props.gcnArchName; }
+
+    std::string get_gfx_name() const { return trim(split_string(get_device_name(), ':').front()); }
 
     std::size_t get_device_major() const { return device_props.major; }
     std::size_t get_device_minor() const { return device_props.minor; }
...
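The added get_gfx_name() accessor derives the plain architecture name from hipDeviceProp_t::gcnArchName, which HIP can report with appended target features (for example "gfx90a:sramecc+:xnack-"); only the token before the first ':' is kept. A rough standalone illustration follows, assuming that string format; trim_copy and gfx_name below are local stand-ins, not the migraphx::trim/split_string helpers.

// Standalone illustration of extracting "gfx90a" from "gfx90a:sramecc+:xnack-".
#include <iostream>
#include <string>

static std::string trim_copy(const std::string& s)
{
    // Drop leading/trailing spaces and tabs.
    auto b = s.find_first_not_of(" \t");
    auto e = s.find_last_not_of(" \t");
    return (b == std::string::npos) ? std::string{} : s.substr(b, e - b + 1);
}

static std::string gfx_name(const std::string& gcn_arch_name)
{
    // Keep everything up to the first ':' (or the whole string if none).
    return trim_copy(gcn_arch_name.substr(0, gcn_arch_name.find(':')));
}

int main()
{
    std::cout << gfx_name("gfx90a:sramecc+:xnack-") << "\n"; // gfx90a
    std::cout << gfx_name("gfx1030") << "\n";                // gfx1030
}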
@@ -41,8 +41,6 @@ struct miopen_contiguous : unary_device<miopen_contiguous, &device::contiguous>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2);
-        if(inputs.front().standard())
-            return inputs.front();
         auto lens = inputs.at(0).lens();
         auto t = inputs.at(0).type();
         return {t, lens};
...
@@ -31,7 +31,7 @@
 #include <migraphx/op/identity.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <unordered_map>
 #include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -146,7 +146,8 @@ struct miopen_convolution
     void set_conv_descriptor()
     {
-        cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
+        cd =
+            (op.name() == "convolution_backwards") ? make_convolution_backwards(op) : make_conv(op);
     }
     value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
@@ -159,10 +160,31 @@ struct miopen_convolution
     shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
     {
         shape workspace_shape{};
         auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
         auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
         auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
         std::size_t workspace_size = 0;
+        auto status = miopenConvolutionForwardGetWorkSpaceSize(miopen_stream_handle,
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
 #ifdef MIGRAPHX_HAS_FIND_2_API
         {
             auto conv_problem = make_obj<miopen_problem>(
@@ -170,13 +192,34 @@ struct miopen_convolution
             set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
             set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            bool preallocate = false;
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
+            preallocate = true;
+#endif
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
+            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto workspace =
+                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
             set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
-            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
-            solution_ptr = find_solution(
-                miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
-            auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, x.implicit()},
+                {miopenTensorConvolutionW, nullptr, w.implicit()},
+                {miopenTensorConvolutionY, nullptr, y.implicit()},
+            };
+
+            solution_ptr = find_solution(miopen_stream_handle,
+                                         3,
+                                         tensor_args,
+                                         workspace.implicit(),
+                                         workspace_size,
+                                         conv_problem.get(),
+                                         ctx.get_exhaustive_tune_flag());
+
+            status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
             if(status != miopenStatusSuccess)
                 MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");
@@ -195,29 +238,10 @@ struct miopen_convolution
             return shape{shape::int8_type, {workspace_size}};
         }
 #else
-        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                                               w_desc.get(),
-                                                               x_desc.get(),
-                                                               cd.get(),
-                                                               y_desc.get(),
-                                                               &workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
-        workspace_shape = shape{shape::int8_type, {workspace_size}};
-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
         auto x = to_gpu(generate_argument(x_shape));
         auto w = to_gpu(generate_argument(w_shape));
         auto y = allocate_gpu(output_shape);
         auto workspace = allocate_gpu(workspace_shape);
         int algo_count = 1;
         miopenConvAlgoPerf_t perf;
         status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
@@ -337,6 +361,7 @@ struct miopen_convolution
         return {s.type(), lens, strides};
     }
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT argmax(hipStream_t stream,
+                                   const argument& result,
+                                   const argument& arg,
+                                   int64_t axis);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT argmin(hipStream_t stream,
+                                   const argument& result,
+                                   const argument& arg,
+                                   int64_t axis);
 } // namespace device
 } // namespace gpu
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/export.h>
#endif
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_KERNELS_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,9 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void contiguous(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT contiguous(hipStream_t stream,
+                                       const argument& result,
+                                       const argument& arg);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void fill(hipStream_t stream, const argument& result, unsigned long val);
+void MIGRAPHX_DEVICE_EXPORT fill(hipStream_t stream, const argument& result, unsigned long val);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis);
+argument MIGRAPHX_DEVICE_EXPORT
+gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,9 +33,13 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_a(hipStream_t stream,
+                                             const argument& result,
+                                             const argument& arg);
-void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg);
+void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_b(hipStream_t stream,
+                                             const argument& result,
+                                             const argument& arg);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+void MIGRAPHX_DEVICE_EXPORT logsoftmax(hipStream_t stream,
+                                       const argument& result,
+                                       const argument& arg,
+                                       int64_t axis);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,10 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void multinomial(hipStream_t stream,
-                 const argument& result,
-                 const argument& arg0,
-                 const argument& arg1);
+void MIGRAPHX_DEVICE_EXPORT multinomial(hipStream_t stream,
+                                        const argument& result,
+                                        const argument& arg0,
+                                        const argument& arg1);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,7 +33,9 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data);
+argument MIGRAPHX_DEVICE_EXPORT nonzero(hipStream_t stream,
+                                        const argument& result,
+                                        const argument& arg_data);
 } // namespace device
 } // namespace gpu
...
@@ -26,7 +26,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_PAD_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -34,11 +34,11 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-argument pad(hipStream_t stream,
-             argument result,
-             argument arg1,
-             float value,
-             std::vector<std::int64_t> pads);
+argument MIGRAPHX_DEVICE_EXPORT pad(hipStream_t stream,
+                                    argument result,
+                                    argument arg1,
+                                    float value,
+                                    std::vector<std::int64_t> pads);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,12 +33,12 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void prefix_scan_sum(hipStream_t stream,
-                     const argument& result,
-                     const argument& arg,
-                     int32_t axis,
-                     bool exclusive,
-                     bool reverse);
+void MIGRAPHX_DEVICE_EXPORT prefix_scan_sum(hipStream_t stream,
+                                            const argument& result,
+                                            const argument& arg,
+                                            int32_t axis,
+                                            bool exclusive,
+                                            bool reverse);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_REVERSE_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,8 +33,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-argument
-reverse(hipStream_t stream, argument result, argument arg1, const std::vector<int64_t>& axes);
+argument MIGRAPHX_DEVICE_EXPORT reverse(hipStream_t stream,
+                                        argument result,
+                                        argument arg1,
+                                        const std::vector<int64_t>& axes);
 } // namespace device
 } // namespace gpu
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_RNN_VARIABLE_SEQ_LENS_HPP
 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>
 namespace migraphx {
@@ -33,22 +33,22 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void rnn_var_sl_shift_sequence(hipStream_t stream,
-                               const argument& result,
-                               const argument& arg_hs,
-                               const argument& arg_sl);
+void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_shift_sequence(hipStream_t stream,
+                                                      const argument& result,
+                                                      const argument& arg_hs,
+                                                      const argument& arg_sl);
-void rnn_var_sl_shift_output(hipStream_t stream,
-                             const argument& result,
-                             const argument& arg_hs,
-                             const argument& arg_sl,
-                             bool is_reverse);
+void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_shift_output(hipStream_t stream,
+                                                    const argument& result,
+                                                    const argument& arg_hs,
+                                                    const argument& arg_sl,
+                                                    bool is_reverse);
-void rnn_var_sl_last_output(hipStream_t stream,
-                            const argument& result,
-                            const argument& arg_hs,
-                            const argument& arg_sl,
-                            bool is_reverse);
+void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_last_output(hipStream_t stream,
+                                                   const argument& result,
+                                                   const argument& arg_hs,
+                                                   const argument& arg_sl,
+                                                   bool is_reverse);
 } // namespace device
 } // namespace gpu
...