Commit 264a7647 authored by Brian Pickrell's avatar Brian Pickrell
Browse files

Merge branch 'develop' into multinomial_parse_merge

parents d99729f8 8e18544f
......@@ -24,7 +24,7 @@
#ifndef MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
#define MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <string>
struct hipDeviceProp_t;
......@@ -33,11 +33,11 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::string get_arch_name(const hipDeviceProp_t& props);
MIGRAPHX_GPU_EXPORT std::string get_arch_name(const hipDeviceProp_t& props);
std::string get_device_name();
MIGRAPHX_GPU_EXPORT std::string get_device_name();
int get_device_id();
MIGRAPHX_GPU_EXPORT int get_device_id();
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -24,7 +24,6 @@
#ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
#define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
......@@ -34,9 +33,9 @@ struct module_pass_manager;
namespace gpu {
bool mlir_enabled();
MIGRAPHX_GPU_EXPORT bool mlir_enabled();
struct fuse_mlir
struct MIGRAPHX_GPU_EXPORT fuse_mlir
{
context* ctx = nullptr;
std::string name() const { return "gpu::fuse_mlir"; }
......
......@@ -24,7 +24,7 @@
#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP
#define MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/check_shapes.hpp>
......@@ -38,26 +38,26 @@ namespace gpu {
struct context;
std::string hip_error(int error);
MIGRAPHX_GPU_EXPORT std::string hip_error(int error);
argument allocate_gpu(const shape& s, bool host = false);
MIGRAPHX_GPU_EXPORT argument allocate_gpu(const shape& s, bool host = false);
argument register_on_gpu(const argument& arg);
MIGRAPHX_GPU_EXPORT argument register_on_gpu(const argument& arg);
argument to_gpu(const argument& arg, bool host = false);
MIGRAPHX_GPU_EXPORT argument to_gpu(const argument& arg, bool host = false);
argument from_gpu(const argument& arg);
MIGRAPHX_GPU_EXPORT argument from_gpu(const argument& arg);
void set_device(std::size_t id);
MIGRAPHX_GPU_EXPORT void set_device(std::size_t id);
void gpu_sync();
void gpu_sync(const context& ctx);
MIGRAPHX_GPU_EXPORT void gpu_sync();
MIGRAPHX_GPU_EXPORT void gpu_sync(const context& ctx);
void gpu_copy(context& ctx, const argument& src, const argument& dst);
void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
void copy_from_gpu(context& ctx, const argument& src, const argument& dst);
MIGRAPHX_GPU_EXPORT void gpu_copy(context& ctx, const argument& src, const argument& dst);
MIGRAPHX_GPU_EXPORT void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
MIGRAPHX_GPU_EXPORT void copy_from_gpu(context& ctx, const argument& src, const argument& dst);
argument get_preallocation(context& ctx, const std::string& id);
MIGRAPHX_GPU_EXPORT argument get_preallocation(context& ctx, const std::string& id);
struct hip_allocate
{
......@@ -187,7 +187,8 @@ struct hip_copy
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 1; }
};
void store_preallocated_param(context& ctx, const std::string& id, const argument& a);
MIGRAPHX_GPU_EXPORT void
store_preallocated_param(context& ctx, const std::string& id, const argument& a);
struct hip_allocate_memory
{
......
......@@ -24,7 +24,7 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP
#define MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <migraphx/gpu/pack_args.hpp>
#include <hip/hip_runtime_api.h>
#include <memory>
......@@ -37,7 +37,7 @@ namespace gpu {
struct kernel_impl;
struct kernel
struct MIGRAPHX_GPU_EXPORT kernel
{
kernel() = default;
kernel(const char* image, const std::string& name);
......
......@@ -24,7 +24,6 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
......@@ -40,7 +39,7 @@ namespace gpu {
* * Maps instructions to their GPU-specific counterparts.
* * Inserts `allocate` instructions before GPU operators.
*/
struct lowering
struct MIGRAPHX_GPU_EXPORT lowering
{
context* ctx;
bool offload_copy;
......
......@@ -75,21 +75,43 @@ using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestr
using miopen_problem = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
using miopen_solution = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);
inline miopen_solution
find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
inline miopen_solution find_solution(miopenHandle_t handle,
size_t num_inputs,
const miopenTensorArgument_t* tensor_args,
void* workspace,
size_t workspace_size,
miopenProblem_t problem,
bool tune = false)
{
miopenSolution_t solution;
size_t found = 0;
miopen_find_options fo = nullptr;
miopen_find_options fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
if(tune)
{
fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
miopenSetFindOptionTuning(fo.get(), 1);
}
auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
for(auto i : range(num_inputs))
{
auto status = miopenSetFindOptionPreallocatedTensor(
fo.get(), tensor_args[i].id, tensor_args[i].buffer);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen: failed to preallocate tensors for the find process");
}
auto status = miopenSetFindOptionPreallocatedWorkspace(fo.get(), workspace, workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen: failed to preallocate workspace for the find process");
#else
miopenStatus_t status;
(void)(num_inputs);
(void)(tensor_args);
(void)(workspace_size);
(void)(workspace);
#endif
status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
auto result = miopen_solution{solution};
if(status != miopenStatusSuccess or found == 0)
MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
MIGRAPHX_THROW("MIOpen: miopenFindSolutions failed");
return result;
}
......@@ -170,7 +192,7 @@ inline convolution_descriptor make_conv(const T& op)
}
template <class T>
inline convolution_descriptor make_deconv(const T& op)
inline convolution_descriptor make_convolution_backwards(const T& op)
{
auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
miopenConvolutionMode_t c_mode = miopenTranspose;
......
......@@ -26,7 +26,7 @@
#include <string>
#include <vector>
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <migraphx/gpu/code_object_op.hpp>
#include <migraphx/instruction_ref.hpp>
......@@ -35,14 +35,16 @@ inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
std::string dump_mlir(const module& m);
code_object_op
compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);
MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);
instruction_ref insert_mlir(module& m,
instruction_ref ins,
code_object_op co,
const std::vector<instruction_ref>& inputs);
MIGRAPHX_GPU_EXPORT code_object_op compile_mlir(const context& ctx,
module m,
const std::vector<instruction_ref>& inputs);
MIGRAPHX_GPU_EXPORT instruction_ref insert_mlir(module& m,
instruction_ref ins,
code_object_op co,
const std::vector<instruction_ref>& inputs);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -24,7 +24,7 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP
#define MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <migraphx/requires.hpp>
#include <utility>
#include <vector>
......@@ -46,7 +46,7 @@ struct kernel_argument
void* data;
};
std::vector<char> pack_args(const std::vector<kernel_argument>& args);
MIGRAPHX_GPU_EXPORT std::vector<char> pack_args(const std::vector<kernel_argument>& args);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -25,7 +25,6 @@
#define MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
......@@ -33,7 +32,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct pack_int8_args
struct MIGRAPHX_GPU_EXPORT pack_int8_args
{
std::string name() const { return "gpu::pack_int8_args"; }
void apply(module& m) const;
......
......@@ -24,7 +24,7 @@
#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
#define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
#include <migraphx/manage_ptr.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
#include <rocblas/rocblas.h>
namespace migraphx {
......@@ -38,9 +38,10 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s);
struct context;
bool get_compute_fp32_flag();
MIGRAPHX_GPU_EXPORT bool get_compute_fp32_flag();
MIGRAPHX_GPU_EXPORT bool get_int8_x4_format(context& ctx);
bool get_int8_x4_format(context& ctx);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -26,13 +26,13 @@
#include <migraphx/program.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct target
struct MIGRAPHX_GPU_EXPORT target
{
std::string name() const;
std::vector<pass> get_passes(migraphx::context& gctx, const compile_options& options) const;
......
......@@ -32,7 +32,7 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::pair<double, double>
MIGRAPHX_GPU_EXPORT std::pair<double, double>
time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
} // namespace gpu
......
......@@ -32,7 +32,7 @@ struct module;
namespace gpu {
struct write_literals
struct MIGRAPHX_GPU_EXPORT write_literals
{
context* ctx = nullptr;
std::string name() const { return "gpu::write_literals"; }
......
......@@ -66,7 +66,7 @@ ${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) {
ck_gemm<${solution}, ${blocks_per_batch}>(xs...);
......
......@@ -47,7 +47,7 @@ ${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) {
concat<${axis}>(${concat_args})(${post}, y, xs...);
......
......@@ -44,7 +44,7 @@ namespace migraphx {
extern "C" {
__global__ void gather_kernel(void* in_data, void* in_indices, void* output)
MIGRAPHX_GLOBAL void gather_kernel(void* in_data, void* in_indices, void* output)
{
make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
gather<${axis}>(xs...);
......
......@@ -44,7 +44,7 @@ namespace migraphx {
extern "C" {
__global__ void gathernd_kernel(void* in_data, void* in_indices, void* output)
MIGRAPHX_GLOBAL void gathernd_kernel(void* in_data, void* in_indices, void* output)
{
make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS}));
......
......@@ -48,7 +48,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto... xs) {
${layernorm}<${axis}>(${post}, ${eps}, xs...);
......
......@@ -44,7 +44,7 @@ static const char* const pointwise_kernel = R"__migraphx__(
namespace migraphx {
extern "C" {
__global__ void pad_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL void pad_kernel(void* input_p, void* output_p)
{
auto offsets = index_ints<${offsets}>{};
auto idx = make_index();
......
......@@ -44,7 +44,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
auto idx = make_index();
pointwise(idx, ${transformers})(${lambda}, ${args});
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment