gaoqiong / MIGraphX

Commit 23cb7917 (unverified)
Merge branch 'develop' into blas_tuning
Authored by Brian Pickrell on Aug 16, 2023; committed via GitHub on Aug 16, 2023
Parents: b5fcc0bc, ea32ca70
Changes: the merge touches 458 files in total; this page shows 20 of them, with 665 additions and 77 deletions (+665 −77).
src/targets/gpu/include/migraphx/gpu/device/scatter.hpp  (+2 −2)
src/targets/gpu/include/migraphx/gpu/device/topk.hpp  (+13 −13)
src/targets/gpu/include/migraphx/gpu/device_name.hpp  (+5 −3)
src/targets/gpu/include/migraphx/gpu/fuse_ck.hpp  (+48 −0)
src/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp  (+3 −2)
src/targets/gpu/include/migraphx/gpu/hip.hpp  (+31 −20)
src/targets/gpu/include/migraphx/gpu/kernel.hpp  (+2 −2)
src/targets/gpu/include/migraphx/gpu/lowering.hpp  (+3 −4)
src/targets/gpu/include/migraphx/gpu/miopen.hpp  (+29 −7)
src/targets/gpu/include/migraphx/gpu/mlir.hpp  (+15 −8)
src/targets/gpu/include/migraphx/gpu/pack_args.hpp  (+2 −2)
src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp  (+1 −2)
src/targets/gpu/include/migraphx/gpu/rocblas.hpp  (+3 −2)
src/targets/gpu/include/migraphx/gpu/target.hpp  (+2 −2)
src/targets/gpu/include/migraphx/gpu/time_op.hpp  (+1 −3)
src/targets/gpu/include/migraphx/gpu/tuning_config.hpp  (+43 −0)
src/targets/gpu/include/migraphx/gpu/write_literals.hpp  (+1 −1)
src/targets/gpu/jit/ck_gemm.cpp  (+457 −0)
src/targets/gpu/jit/concat.cpp  (+2 −2)
src/targets/gpu/jit/gather.cpp  (+2 −2)
src/targets/gpu/include/migraphx/gpu/device/scatter.hpp (+2 −2)

@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_SCATTER_HPP

 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>

 namespace migraphx {
@@ -33,7 +33,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument scatter(hipStream_t stream, argument result, argument arg0, argument arg1, argument arg2, int64_t axis);
+argument MIGRAPHX_DEVICE_EXPORT scatter(hipStream_t stream, argument result, argument arg0, argument arg1, argument arg2, int64_t axis);

 } // namespace device
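Most of the header changes on this page follow the same pattern as scatter.hpp: a declaration gains a MIGRAPHX_DEVICE_EXPORT or MIGRAPHX_GPU_EXPORT marker, and the include is switched to the gpu config header that provides that marker. The macro definitions themselves are not part of this diff; the sketch below only illustrates what an export macro of this kind usually expands to (a symbol-visibility attribute) and is not the actual MIGraphX definition.

// Illustrative stand-in only; the real MIGRAPHX_GPU_EXPORT / MIGRAPHX_DEVICE_EXPORT
// definitions come from the library's own config headers.
#if defined(_WIN32)
#define MIGRAPHX_EXPORT_SKETCH __declspec(dllexport)
#else
#define MIGRAPHX_EXPORT_SKETCH __attribute__((visibility("default")))
#endif

// When the GPU target is built as a shared library with hidden visibility by default,
// only declarations carrying such a marker remain visible to other libraries:
MIGRAPHX_EXPORT_SKETCH int example_exported_function();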
src/targets/gpu/include/migraphx/gpu/device/topk.hpp (+13 −13)

@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP

 #include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/config.hpp>
 #include <hip/hip_runtime_api.h>

 namespace migraphx {
@@ -33,19 +33,19 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument topk_smallest(hipStream_t stream, const argument& val_res, const argument& ind_res, const argument& arg, int64_t k, int64_t axis);
+argument MIGRAPHX_DEVICE_EXPORT topk_smallest(hipStream_t stream, const argument& val_res, const argument& ind_res, const argument& arg, int64_t k, int64_t axis);

-argument topk_largest(hipStream_t stream, const argument& val_res, const argument& ind_res, const argument& arg, int64_t k, int64_t axis);
+argument MIGRAPHX_DEVICE_EXPORT topk_largest(hipStream_t stream, const argument& val_res, const argument& ind_res, const argument& arg, int64_t k, int64_t axis);

 } // namespace device
 } // namespace gpu
src/targets/gpu/include/migraphx/gpu/device_name.hpp (+5 −3)

@@ -24,16 +24,18 @@
 #ifndef MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
 #define MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>

+struct hipDeviceProp_t;
+
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-std::string get_device_name();
+MIGRAPHX_GPU_EXPORT std::string get_device_name();

-int get_device_id();
+MIGRAPHX_GPU_EXPORT int get_device_id();

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
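device_name.hpp now forward-declares hipDeviceProp_t rather than pulling in the full HIP runtime header, and exports get_device_name() and get_device_id(). For orientation, the standalone HIP program below (not MIGraphX code; it assumes a ROCm toolchain and is built with hipcc) queries roughly the same information those helpers expose.

// Standalone sketch using the plain HIP runtime API.
#include <hip/hip_runtime_api.h>
#include <iostream>

int main()
{
    int id = 0;
    hipGetDevice(&id); // id of the currently selected device
    hipDeviceProp_t props{};
    hipGetDeviceProperties(&props, id); // gcnArchName holds e.g. "gfx90a:sramecc+:xnack-"
    std::cout << "device " << id << ": " << props.gcnArchName << "\n";
    return 0;
}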
src/targets/gpu/include/migraphx/gpu/fuse_ck.hpp (new file, +48 −0)

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
#define MIGRAPHX_GUARD_GPU_FUSE_CK_HPP

#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct module_pass_manager;

namespace gpu {

struct fuse_ck
{
    context* ctx = nullptr;
    std::string name() const { return "gpu::fuse_ck"; }
    void apply(module_pass_manager& mpm) const;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
src/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp (+3 −2)

@@ -24,7 +24,6 @@
 #ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
 #define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP

 #include <migraphx/config.hpp>
-#include <migraphx/gpu/context.hpp>

 namespace migraphx {
@@ -34,7 +33,9 @@ struct module_pass_manager;
 namespace gpu {

-struct fuse_mlir
+MIGRAPHX_GPU_EXPORT bool mlir_enabled();
+
+struct MIGRAPHX_GPU_EXPORT fuse_mlir
 {
     context* ctx = nullptr;
     std::string name() const { return "gpu::fuse_mlir"; }
src/targets/gpu/include/migraphx/gpu/hip.hpp (+31 −20)

@@ -24,11 +24,12 @@
 #ifndef MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP
 #define MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/functional.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <utility>

 namespace migraphx {
@@ -37,26 +38,26 @@ namespace gpu {
 struct context;

-std::string hip_error(int error);
+MIGRAPHX_GPU_EXPORT std::string hip_error(int error);

-argument allocate_gpu(const shape& s, bool host = false);
+MIGRAPHX_GPU_EXPORT argument allocate_gpu(const shape& s, bool host = false);

-argument register_on_gpu(const argument& arg);
+MIGRAPHX_GPU_EXPORT argument register_on_gpu(const argument& arg);

-argument to_gpu(const argument& arg, bool host = false);
+MIGRAPHX_GPU_EXPORT argument to_gpu(const argument& arg, bool host = false);

-argument from_gpu(const argument& arg);
+MIGRAPHX_GPU_EXPORT argument from_gpu(const argument& arg);

-void set_device(std::size_t id);
+MIGRAPHX_GPU_EXPORT void set_device(std::size_t id);

-void gpu_sync();
-void gpu_sync(const context& ctx);
+MIGRAPHX_GPU_EXPORT void gpu_sync();
+MIGRAPHX_GPU_EXPORT void gpu_sync(const context& ctx);

-void gpu_copy(context& ctx, const argument& src, const argument& dst);
-void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
-void copy_from_gpu(context& ctx, const argument& src, const argument& dst);
+MIGRAPHX_GPU_EXPORT void gpu_copy(context& ctx, const argument& src, const argument& dst);
+MIGRAPHX_GPU_EXPORT void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
+MIGRAPHX_GPU_EXPORT void copy_from_gpu(context& ctx, const argument& src, const argument& dst);

-argument get_preallocation(context& ctx, const std::string& id);
+MIGRAPHX_GPU_EXPORT argument get_preallocation(context& ctx, const std::string& id);

 struct hip_allocate
 {
@@ -91,7 +92,7 @@ struct hip_sync_stream
         return inputs.front();
     }

-    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
+    argument compute(const context& ctx, const shape&, const std::vector<argument>& args) const
     {
         gpu_sync(ctx);
         if(args.empty())
@@ -112,7 +113,7 @@ struct hip_copy_to_gpu
     std::string name() const { return "hip::copy_to_gpu"; }

     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2).same_type();
+        check_shapes{inputs, *this, true}.has(1, 2).same_type();
         return inputs.at(0);
     }

     argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
@@ -121,6 +122,10 @@ struct hip_copy_to_gpu
         if(args.size() == 1)
             return input;
         argument result = args[1].share();
+        if(result.get_shape().dynamic())
+        {
+            result = result.reshape(args[0].get_shape());
+        }
         gpu_copy(ctx, input, result);
         // Associate the input since it was registered with hip
         return {result.get_shape(), [input, result]() mutable { return result.data(); }};
@@ -138,19 +143,24 @@ struct hip_copy_from_gpu
     std::string name() const { return "hip::copy_from_gpu"; }

     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2).same_type();
+        check_shapes{inputs, *this, true}.has(1, 2).same_type();
         return inputs.at(0);
     }

     argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
+    compute(context& ctx, const dyn_output& dyn_out, const std::vector<argument>& args) const
     {
         if(args.size() == 1)
         {
-            argument result = allocate_gpu(output_shape, true);
+            argument result = allocate_gpu(dyn_out.computed_shape, true);
             gpu_copy(ctx, args[0], result);
             return result;
         }
-        copy_from_gpu(ctx, args[0], args[1]);
+        argument input = args[0].share();
+        if(input.get_shape().dynamic())
+        {
+            input = input.reshape(args[1].get_shape());
+        }
+        copy_from_gpu(ctx, input, args[1]);
         return args[1];
     }

     std::ptrdiff_t output_alias(const std::vector<shape>& args) const
@@ -177,7 +187,8 @@ struct hip_copy
     std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 1; }
 };

-void store_preallocated_param(context& ctx, const std::string& id, const argument& a);
+MIGRAPHX_GPU_EXPORT void
+store_preallocated_param(context& ctx, const std::string& id, const argument& a);

 struct hip_allocate_memory
 {
src/targets/gpu/include/migraphx/gpu/kernel.hpp (+2 −2)

@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP
 #define MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/gpu/pack_args.hpp>
 #include <hip/hip_runtime_api.h>
 #include <memory>
@@ -37,7 +37,7 @@ namespace gpu {

 struct kernel_impl;

-struct kernel
+struct MIGRAPHX_GPU_EXPORT kernel
 {
     kernel() = default;
     kernel(const char* image, const std::string& name);
src/targets/gpu/include/migraphx/gpu/lowering.hpp (+3 −4)

@@ -24,13 +24,12 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
 #define MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP

 #include <migraphx/config.hpp>
-#include <migraphx/gpu/context.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-struct module;
+struct module_pass_manager;

 namespace gpu {
@@ -40,12 +39,12 @@ namespace gpu {
  * * Maps instructions to their GPU-specific counterparts.
  * * Inserts `allocate` instructions before GPU operators.
  */
-struct lowering
+struct MIGRAPHX_GPU_EXPORT lowering
 {
     context* ctx;
     bool offload_copy;
     std::string name() const { return "gpu::lowering"; }
-    void apply(module& m) const;
+    void apply(module_pass_manager& mpm) const;
 };

 } // namespace gpu
src/targets/gpu/include/migraphx/gpu/miopen.hpp (+29 −7)

@@ -75,21 +75,43 @@
 using miopen_problem  = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
 using miopen_solution = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);

-inline miopen_solution
-find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
+inline miopen_solution find_solution(miopenHandle_t handle,
+                                     size_t num_inputs,
+                                     const miopenTensorArgument_t* tensor_args,
+                                     void* workspace,
+                                     size_t workspace_size,
+                                     miopenProblem_t problem,
+                                     bool tune = false)
 {
     miopenSolution_t solution;
     size_t found = 0;
-    miopen_find_options fo = nullptr;
+    miopen_find_options fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
     if(tune)
     {
-        fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
         miopenSetFindOptionTuning(fo.get(), 1);
     }
-    auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+    for(auto i : range(num_inputs))
+    {
+        auto status = miopenSetFindOptionPreallocatedTensor(
+            fo.get(), tensor_args[i].id, tensor_args[i].buffer);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen: failed to preallocate tensors for the find process");
+    }
+    auto status = miopenSetFindOptionPreallocatedWorkspace(fo.get(), workspace, workspace_size);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen: failed to preallocate workspace for the find process");
+#else
+    miopenStatus_t status;
+    (void)(num_inputs);
+    (void)(tensor_args);
+    (void)(workspace_size);
+    (void)(workspace);
+#endif
+    status      = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
     auto result = miopen_solution{solution};
     if(status != miopenStatusSuccess or found == 0)
-        MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
+        MIGRAPHX_THROW("MIOpen: miopenFindSolutions failed");
     return result;
 }
@@ -170,7 +192,7 @@ inline convolution_descriptor make_conv(const T& op)
 }

 template <class T>
-inline convolution_descriptor make_deconv(const T& op)
+inline convolution_descriptor make_convolution_backwards(const T& op)
 {
     auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
     miopenConvolutionMode_t c_mode = miopenTranspose;
src/targets/gpu/include/migraphx/gpu/mlir.hpp (+15 −8)

@@ -26,23 +26,30 @@
 #include <string>
 #include <vector>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/instruction_ref.hpp>
+#include <migraphx/gpu/tuning_config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

 struct module;

 namespace gpu {

-std::string dump_mlir(const module& m);
+MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);

-code_object_op compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);
+MIGRAPHX_GPU_EXPORT code_object_op compile_mlir(const context& migraphx_ctx,
+                                                module m,
+                                                const std::vector<instruction_ref>& inputs,
+                                                const value& solution);

-instruction_ref insert_mlir(module& m, instruction_ref ins, code_object_op co, const std::vector<instruction_ref>& inputs);
+MIGRAPHX_GPU_EXPORT instruction_ref
+insert_mlir(module& m, instruction_ref ins, code_object_op co, const std::vector<instruction_ref>& inputs);

+MIGRAPHX_GPU_EXPORT tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
+                                                         module m,
+                                                         const std::vector<shape>& inputs);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
src/targets/gpu/include/migraphx/gpu/pack_args.hpp (+2 −2)

@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/requires.hpp>
 #include <utility>
 #include <vector>
@@ -46,7 +46,7 @@ struct kernel_argument
     void* data;
 };

-std::vector<char> pack_args(const std::vector<kernel_argument>& args);
+MIGRAPHX_GPU_EXPORT std::vector<char> pack_args(const std::vector<kernel_argument>& args);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp (+1 −2)

@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP

 #include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
-#include <migraphx/gpu/context.hpp>

 namespace migraphx {
@@ -33,7 +32,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct pack_int8_args
+struct MIGRAPHX_GPU_EXPORT pack_int8_args
 {
     std::string name() const { return "gpu::pack_int8_args"; }
     void apply(module& m) const;
src/targets/gpu/include/migraphx/gpu/rocblas.hpp (+3 −2)

@@ -39,9 +39,10 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s);
 struct context;

-bool get_compute_fp32_flag();
+MIGRAPHX_GPU_EXPORT bool get_compute_fp32_flag();

-bool get_int8_x4_format(context& ctx);
+MIGRAPHX_GPU_EXPORT bool get_int8_x4_format(context& ctx);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/include/migraphx/gpu/target.hpp (+2 −2)

@@ -26,13 +26,13 @@
 #include <migraphx/program.hpp>
 #include <migraphx/compile_options.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct target
+struct MIGRAPHX_GPU_EXPORT target
 {
     std::string name() const;
     std::vector<pass> get_passes(migraphx::context& gctx, const compile_options& options) const;
src/targets/gpu/driver/include/migraphx/gpu/driver/perf.hpp → src/targets/gpu/include/migraphx/gpu/time_op.hpp (+1 −3)

@@ -31,12 +31,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-namespace driver {

-std::pair<double, double>
+MIGRAPHX_GPU_EXPORT std::pair<double, double>
 time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);

-} // namespace driver
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/include/migraphx/gpu/tuning_config.hpp (new file, +43 −0)

/* MIT License header, identical to the one in fuse_ck.hpp above */
#ifndef MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
#define MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP

#include <migraphx/config.hpp>
#include <migraphx/value.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct tuning_config
{
    value problem;
    std::vector<value> solutions;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
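tuning_config is a small carrier type: a problem description plus the list of candidate solutions a tuner should time (ck_gemm.cpp below fills one in get_tuning_config). The real struct stores migraphx::value objects; the standalone stand-in below uses plain standard-library types only to show the intended shape of the data.

// Stand-in with plain std types; the field types here are placeholders, not the real ones.
#include <cstddef>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

struct tuning_config_sketch
{
    std::string problem;                // e.g. the gemm shapes, serialized
    std::vector<std::size_t> solutions; // candidate solution indices to benchmark
};

int main()
{
    tuning_config_sketch tc;
    tc.problem = "gemm: {1, 1024, 1024} x {1, 1024, 4096}";
    tc.solutions.resize(8);
    std::iota(tc.solutions.begin(), tc.solutions.end(), 0); // try solutions 0..7
    std::cout << tc.problem << " -> " << tc.solutions.size() << " candidates\n";
    return 0;
}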
src/targets/gpu/include/migraphx/gpu/write_literals.hpp (+1 −1)

@@ -32,7 +32,7 @@ struct module;
 namespace gpu {

-struct write_literals
+struct MIGRAPHX_GPU_EXPORT write_literals
 {
     context* ctx = nullptr;
     std::string name() const { return "gpu::write_literals"; }
src/targets/gpu/jit/ck_gemm.cpp (new file, +457 −0)

/* MIT License header, identical to the one in fuse_ck.hpp above */
#include <fstream>
#include <migraphx/filesystem.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/env.hpp>
#include <migraphx/file_buffer.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/module.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include "ck/host/device_gemm_multiple_d.hpp"

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

using namespace migraphx::gpu::gen; // NOLINT

MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);

// NOLINTNEXTLINE
static const char* const ck_gemm_kernel = R"__migraphx__(
#include <args.hpp>
#include <migraphx/kernels/ck_gemm.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <migraphx/kernels/ops.hpp>
#include <${include}>

namespace migraphx {

${preamble}

extern "C" {

MIGRAPHX_GLOBAL void ${kernel}(${params})
{
    transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) {
        ck_gemm<${solution}, ${blocks_per_batch}>(xs...);
    });
}

}

} // namespace migraphx

)__migraphx__";

// NOLINTNEXTLINE
static const char* const disable_warning_pragma = R"__migraphx__(
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Weverything"
${content}
#pragma clang diagnostic pop
)__migraphx__";

template <class P>
static std::string ck_disable_warnings(P p)
{
    return interpolate_string(disable_warning_pragma,
                              {{"content", std::string{p.first, p.second}}});
}
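ck_gemm_kernel and disable_warning_pragma are template strings whose ${...} placeholders get filled by interpolate_string from migraphx/stringutils.hpp before the HIP source is compiled. The stand-in below reimplements only the substitution idea with the standard library; it is not the real helper.

// Minimal placeholder substitution, assuming simple non-overlapping ${key} tokens.
#include <cstddef>
#include <iostream>
#include <map>
#include <string>

static std::string substitute(std::string text, const std::map<std::string, std::string>& vars)
{
    for(const auto& [key, val] : vars)
    {
        const std::string token = "${" + key + "}";
        for(std::size_t pos = text.find(token); pos != std::string::npos; pos = text.find(token))
            text.replace(pos, token.size(), val);
    }
    return text;
}

int main()
{
    const std::string templ = "MIGRAPHX_GLOBAL void ${kernel}(${params})";
    std::cout << substitute(templ,
                            {{"kernel", "ck_gemm_example_kernel"},
                             {"params", "void* private_p0, void* private_p1"}})
              << "\n";
    return 0;
}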
static std::unordered_map<std::string, std::string> create_ck_header_strings()
{
    std::unordered_map<std::string, std::string> result;
    auto ck_headers = ck::host::GetHeaders();
    std::transform(ck_headers.begin(),
                   ck_headers.end(),
                   std::inserter(result, result.begin()),
                   [&](auto&& p) { return std::make_pair(p.first, ck_disable_warnings(p.second)); });
    return result;
}

static std::vector<src_file> create_ck_headers()
{
    static const auto& header_strings = create_ck_header_strings();
    std::vector<src_file> srcs;
    std::transform(header_strings.begin(),
                   header_strings.end(),
                   std::back_inserter(srcs),
                   [&](auto&& p) {
                       return src_file{fs::path{p.first},
                                       {p.second.data(), p.second.data() + p.second.size()}};
                   });
    return srcs;
}

static const std::vector<src_file>& ck_headers()
{
    static const auto& headers = create_ck_headers();
    return headers;
}

static bool transposed_matrix(const shape& s) { return s.strides().back() != 1; }

using tuning_entry = std::pair<std::vector<shape>, size_t>;

static std::vector<tuning_entry> read_tuning(const std::string& s)
{
    if(not fs::exists(s))
        return {};
    return from_value<std::vector<tuning_entry>>(from_json_string(read_string(s)));
}

static float matrix_distance(const shape& x, const shape& y)
{
    if(x.type() != y.type())
        return std::numeric_limits<float>::max();
    if(transposed_matrix(x) != transposed_matrix(y))
        return std::numeric_limits<float>::max();
    auto sum_squared = std::inner_product(x.lens().rbegin(),
                                          x.lens().rbegin() + 2,
                                          y.lens().rbegin(),
                                          0,
                                          std::plus<>{},
                                          [](auto a, auto b) { return (a - b) * (a - b); });
    return std::sqrt(sum_squared);
}

static std::size_t get_tuning_for(const std::vector<shape>& inputs)
{
    static auto tuning = read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, ""));
    if(tuning.empty())
    {
        std::cout << "*********** Warning: No CK tuning! for config:" << std::endl;
        std::cout << "  " << inputs[0] << std::endl;
        std::cout << "  " << inputs[1] << std::endl;
        std::cout << "  " << inputs[2] << std::endl;
    }
    auto it = std::find_if(
        tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; });
    if(it == tuning.end())
    {
        std::cout << "*********** Warning: CK tuning missing for config!" << std::endl;
        std::cout << "  " << inputs[0] << std::endl;
        std::cout << "  " << inputs[1] << std::endl;
        std::cout << "  " << inputs[2] << std::endl;
        std::vector<std::pair<float, std::size_t>> w;
        std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) {
            if(inputs.size() < 3 or p.first.size() < 3)
                MIGRAPHX_THROW("Invalid CK config");
            auto avg_distance = std::inner_product(
                p.first.begin(),
                p.first.begin() + 3,
                inputs.begin(),
                0.0f,
                std::plus<>{},
                [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; });
            return std::make_pair(avg_distance, p.second);
        });
        std::sort(w.begin(), w.end());
        std::size_t default_value = 4;
        if(not w.empty())
            default_value = w.front().second;
        auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value);
        std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl;
        return tuning_val;
    }
    return it->second;
}
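get_tuning_for falls back to a nearest-neighbour lookup when the exact input shapes are not present in the tuning file: each tuned entry is scored with matrix_distance, the Euclidean distance over the trailing two dimensions, and the closest entry's solution index becomes the default. The self-contained snippet below illustrates that heuristic with plain dimension vectors; it is an illustration of the idea, not MIGraphX code.

// Pick the tuned configuration whose last two dimensions are closest to the query.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

static float distance2d(const std::vector<long>& x, const std::vector<long>& y)
{
    // Compare only the last two dimensions, as matrix_distance does.
    float sum = 0.0f;
    for(int i = 1; i <= 2; i++)
    {
        const auto a = x[x.size() - i];
        const auto b = y[y.size() - i];
        sum += float(a - b) * float(a - b);
    }
    return std::sqrt(sum);
}

int main()
{
    const std::vector<std::vector<long>> tuned = {{1, 1024, 1024}, {1, 4096, 4096}};
    const std::vector<long> query              = {1, 1000, 1024};
    const auto it = std::min_element(tuned.begin(), tuned.end(), [&](const auto& a, const auto& b) {
        return distance2d(a, query) < distance2d(b, query);
    });
    std::cout << "closest tuned config index: " << (it - tuned.begin()) << "\n";
    return 0;
}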
struct ck_gemm_compiler : compiler<ck_gemm_compiler>
{
    static std::string get_layout(const shape& s)
    {
        return transposed_matrix(s) ? "ck::tensor_layout::gemm::ColumnMajor"
                                    : "ck::tensor_layout::gemm::RowMajor";
    }

    static ck::host::DataType get_type(const shape& s)
    {
        if(s.type() == shape::half_type)
            return ck::host::DataType::Half;
        else if(s.type() == shape::float_type)
            return ck::host::DataType::Float;
        else if(s.type() == shape::int8_type)
            return ck::host::DataType::Int8;
        else if(s.type() == shape::int32_type)
            return ck::host::DataType::Int32;
        MIGRAPHX_THROW("Unsupported ck type");
    }

    template <class Iterator, class F>
    static std::string ck_tuple(Iterator start, Iterator last, F f)
    {
        std::vector<std::string> s;
        std::transform(start, last, std::back_inserter(s), f);
        return "ck::Tuple<" + join_strings(s, ",") + ">";
    }

    static std::vector<shape> adjust_inputs(std::vector<shape> inputs, bool& swap_inputs)
    {
        swap_inputs  = false;
        auto c_shape = inputs.back();
        if(not transposed_matrix(c_shape))
            return inputs;
        std::vector<int64_t> perm(c_shape.lens().size());
        std::iota(perm.begin(), perm.end(), 0);
        std::swap(perm[perm.size() - 1], perm[perm.size() - 2]);
        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) {
            return reorder_shape(s, perm);
        });
        swap_inputs = true;
        return inputs;
    }

    static std::size_t get_batch_count(const shape& s)
    {
        return std::accumulate(s.lens().rbegin() + 2,
                               s.lens().rend(),
                               std::size_t{1},
                               std::multiplies<std::size_t>());
    }

    static void fold_batch_dims(shape& s)
    {
        auto lens = s.lens();
        if(lens.size() <= 2)
            return;
        auto batch_count = get_batch_count(s);
        auto m1          = lens.at(lens.size() - 2);
        auto m2          = lens.at(lens.size() - 1);
        if(transposed_matrix(s))
            s = shape{s.type(), {m1, m2 * batch_count}};
        else
            s = shape{s.type(), {m1 * batch_count, m2}};
    }

    static void remove_batch_dims(shape& s)
    {
        auto lens = s.lens();
        if(lens.size() <= 2)
            return;
        auto m1 = lens.at(lens.size() - 2);
        auto m2 = lens.at(lens.size() - 1);
        s       = shape{s.type(), {m1, m2}};
    }

    std::vector<std::string> names() const { return {"ck_gemm", "gpu::ck_gemm"}; }

    static bool standard_batch(const shape& s)
    {
        if(s.lens().size() < 3)
            return true;
        std::vector<std::size_t> lens(s.lens().begin(), s.lens().end() - 2);
        std::vector<std::size_t> strides(s.strides().begin(), s.strides().end() - 2);
        auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1);
        std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) {
            return stride / base;
        });
        return shape{s.type(), lens, strides}.standard();
    }

    bool can_fold_batch(const std::vector<shape>& inputs) const
    {
        const auto& b_shape = inputs[1];
        if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) {
               return not standard_batch(input);
           }))
            return false;
        const auto& b_strides = b_shape.strides();
        return std::all_of(
            b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; });
    }

    ck::host::device_gemm_multiple_d::Problem create_problem(const std::vector<shape>& inputs,
                                                             const value& v) const
    {
        const auto& a_shape = inputs[0];
        const auto& b_shape = inputs[1];
        const auto& c_shape = inputs.back();

        // cppcheck-suppress unreadVariable
        auto rank        = a_shape.ndim();
        auto batch_count = get_batch_count(c_shape);
        auto m           = c_shape.lens()[rank - 2];
        m                = can_fold_batch(inputs) ? m * batch_count : m;
        auto n           = c_shape.lens().back();
        auto k           = a_shape.lens().back();

        const bool trans_a = transposed_matrix(a_shape);
        const bool trans_b = transposed_matrix(b_shape);
        const bool trans_e = transposed_matrix(c_shape);

        const auto a_type = get_type(a_shape);
        const auto b_type = get_type(b_shape);
        const auto e_type = get_type(c_shape);

        std::vector<bool> ds_layout;
        std::transform(inputs.begin() + 2,
                       inputs.end() - 1,
                       std::back_inserter(ds_layout),
                       [](const auto& i) { return transposed_matrix(i); });

        std::vector<ck::host::DataType> ds_type;
        std::transform(inputs.begin() + 2,
                       inputs.end() - 1,
                       std::back_inserter(ds_type),
                       [](const auto& i) { return get_type(i); });

        std::string ck_passthrough = "ck_passthrough";
        std::string cde_op         = ck_passthrough;
        assert(inputs.size() < 4 or v.contains("post"));
        if(v.contains("post"))
        {
            cde_op = v.at("post").to<std::string>();
        }

        return ck::host::device_gemm_multiple_d::Problem{m,
                                                         n,
                                                         k,
                                                         trans_a,
                                                         trans_b,
                                                         trans_e,
                                                         ds_layout,
                                                         a_type,
                                                         b_type,
                                                         e_type,
                                                         ds_type,
                                                         ck_passthrough,
                                                         ck_passthrough,
                                                         cde_op};
    }

    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        const auto& a_shape = inputs[0];
        const auto& b_shape = inputs[1];
        const auto& c_shape = inputs.back();
        auto tuning_value   = v.get("tuning_value", 4);
        if(not v.contains("tuning_value"))
            tuning_value = get_tuning_for({a_shape, b_shape, c_shape});
        auto batch_count = get_batch_count(c_shape);
        auto problem     = create_problem(inputs, v);

        const auto include_header   = problem.GetIncludeHeader();
        const auto solutions        = problem.GetSolutions(ctx.get_current_device().get_gfx_name());
        const auto& solution        = solutions.at(tuning_value);
        const auto template_str     = solution.template_str;
        const auto blocks_per_batch = solution.grid_size;
        const auto block_size       = solution.block_size;

        hip_compile_options options;
        options.additional_src_files = ck_headers();
        auto grid_size = can_fold_batch(inputs) ? blocks_per_batch : batch_count * blocks_per_batch;
        options.set_launch_params(v, grid_size * block_size, block_size);
        options.inputs         = inputs;
        options.output         = c_shape;
        options.kernel_name    = v.get("kernel", "ck_gemm_kernel");
        options.virtual_inputs = inputs;
        if(can_fold_batch(inputs))
        {
            auto vinputs = inputs;
            fold_batch_dims(vinputs[0]);
            remove_batch_dims(vinputs[1]);
            std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims);
            options.virtual_inputs = vinputs;
        }
        if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{}))
            options.params += " -DMIGRAPHX_CK_CHECK=1";

        auto src = interpolate_string(ck_gemm_kernel,
                                      {{"solution", template_str},
                                       {"include", include_header},
                                       {"params", enum_params(inputs.size(), "void * private_p")},
                                       {"args", enum_params(inputs.size(), "private_p")},
                                       {"blocks_per_batch", to_string(blocks_per_batch)},
                                       {"preamble", v.get("preamble", std::string{})},
                                       {"kernel", options.kernel_name}});

        return compile_hip_code_object(src, options);
    }

    value create_settings(instruction_ref ins, const operation& op) const
    {
        auto v      = op.to_value();
        v["kernel"] = "ck_gemm_kernel";
        if(not ins->module_inputs().empty())
        {
            auto* pm      = ins->module_inputs().front();
            v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_function") +
                            "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm, post_ck_gemm_function);";
            v["post"]   = "ck_function_adaptor<post_ck_gemm>";
            v["kernel"] = "ck_gemm_" + generate_name_from_ops(*pm) + "_kernel";
        }
        return v;
    }

    compiler_replace
    compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
    {
        auto shapes = to_shapes(ins->inputs());
        auto v      = create_settings(ins, op);
        if(not solution.is_null())
            v["tuning_value"] = solution;
        return {compile_op(ctx, shapes, v),
                [=](module& m, instruction_ref ins2, const operation& code_object) {
                    if(enabled(MIGRAPHX_LOG_CK_GEMM{}))
                    {
                        std::vector<shape> gemm_shapes{
                            shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())};
                        std::cout << "gpu::ck_gemm: " << to_json_string(to_value(gemm_shapes))
                                  << std::endl;
                    }
                    m.replace_instruction(ins2, code_object, ins2->inputs());
                }};
    }

    optional<tuning_config>
    get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const
    {
        if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{}))
            return nullopt;
        tuning_config tc;
        auto shapes    = to_shapes(ins->inputs());
        auto problem   = create_problem(shapes, create_settings(ins, op));
        auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name());
        tc.solutions.resize(solutions.size());
        std::iota(tc.solutions.begin(), tc.solutions.end(), 0);
        std::vector<shape> gemm_shapes{shapes[0], shapes[1], shapes.back()};
        tc.problem = to_value(gemm_shapes);
        return tc;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/jit/concat.cpp (+2 −2)

@@ -47,7 +47,7 @@ ${preamble}

 extern "C" {

-__global__ void ${kernel}(${params})
+MIGRAPHX_GLOBAL void ${kernel}(${params})
 {
     transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) {
         concat<${axis}>(${concat_args})(${post}, y, xs...);
@@ -108,7 +108,7 @@ struct concat_compiler : compiler<concat_compiler>
             v["post"]   = "MIGRAPHX_LIFT(post_concat)";
             v["kernel"] = "concat_" + generate_name_from_ops(*pm) + "_kernel";
         }
-        return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
+        return compile_op(ctx, to_shapes(ins->inputs()), v);
     }
 };
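Both JIT sources replace the hard-coded __global__ qualifier with MIGRAPHX_GLOBAL in their embedded kernel templates, matching the ck_gemm kernel above. The macro itself is defined in the device-side kernel headers rather than in this diff; a plausible stand-in, shown only to make the intent concrete, is a thin wrapper over the HIP kernel qualifier.

// Assumed fallback definition for illustration; the real macro comes from the
// migraphx kernel headers and may differ.
#ifndef MIGRAPHX_GLOBAL
#define MIGRAPHX_GLOBAL __global__
#endif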
src/targets/gpu/jit/gather.cpp (+2 −2)

@@ -44,7 +44,7 @@ namespace migraphx {

 extern "C" {

-__global__ void gather_kernel(void* in_data, void* in_indices, void* output)
+MIGRAPHX_GLOBAL void gather_kernel(void* in_data, void* in_indices, void* output)
 {
     make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
         gather<${axis}>(xs...);
@@ -80,7 +80,7 @@ struct gather_compiler : compiler<gather_compiler>
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
-        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
     }
 };