Commit 81b0ff5d authored by Paul Fultz II's avatar Paul Fultz II Committed by mvermeulen
Browse files

Add option to do offload copying automatically (#403)

* Add compiler options

* Add copy operators

* Formatting

* Use run_passes in tests

* Formatting

* Use run_pass in schedule test

* Formatting

* Add compile_options to get_passes in target

* Formatting

* Offload copy option

* Formatting

* Copy using pinned memory

* Formatting

* Improve performance of gpu copying

* Formatting

* Don't copy

* Formatting

* Always make an extra copy

* Formatting

* Remove unused write op

* Add missing include

* Remove copy_to_gpu function in python api

* Make offload copy disabled by default on C++

* Formatting

* Fix tidy issues

* Formatting

* Fix namespace

* Fix python tests

* Turn clang-format off since it's broken

* Fix compile error on gcc 5

* Remove commented code
parent e814cffb
......@@ -87,8 +87,9 @@ struct compiler
static const int q_fp16 = 1;
static const int q_int8 = 2;
loader l;
bool gpu = true;
int quantize = 0;
bool gpu = true;
bool offload_copy = false;
int quantize = 0;
std::vector<std::string> fill1;
void parse(argument_parser& ap)
......@@ -96,6 +97,10 @@ struct compiler
l.parse(ap);
ap(gpu, {"--gpu"}, ap.help("Compile on the gpu"), ap.set_value(true));
ap(gpu, {"--cpu"}, ap.help("Compile on the cpu"), ap.set_value(false));
ap(offload_copy,
{"--enable-offload-copy"},
ap.help("Enable implicit offload copying"),
ap.set_value(false));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(q_fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(q_int8));
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append());
......@@ -103,10 +108,11 @@ struct compiler
auto params(const program& p, bool use_gpu = true)
{
bool gpu_flag = use_gpu && gpu && !offload_copy;
program::parameter_map m;
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, use_gpu && gpu);
fill_param_map(m, p, gpu_flag);
return m;
}
......@@ -122,7 +128,9 @@ struct compiler
{
quantize_int8(p, t, {params(p, false)});
}
p.compile(t);
compile_options options;
options.offload_copy = offload_copy;
p.compile(t, options);
return p;
}
};
......
......@@ -9,7 +9,7 @@ argument fill_argument(shape s, unsigned long value)
s.visit_type([&](auto as) {
using type = typename decltype(as)::type;
auto v = fill_tensor_data<type>(s, value);
result = {s, [v]() mutable { return reinterpret_cast<char*>(v.data()); }};
result = {s, v};
});
return result;
}
......@@ -20,7 +20,7 @@ argument generate_argument(shape s, unsigned long seed)
s.visit_type([&](auto as) {
using type = typename decltype(as)::type;
auto v = generate_tensor_data<type>(s, seed);
result = {s, [v]() mutable { return reinterpret_cast<char*>(v.data()); }};
result = {s, v};
});
return result;
}
......@@ -31,7 +31,7 @@ literal generate_literal(shape s, unsigned long seed)
s.visit_type([&](auto as) {
using type = typename decltype(as)::type;
auto v = generate_tensor_data<type>(s, seed);
result = {s, v};
result = {s, reinterpret_cast<char*>(v.get())};
});
return result;
}
......
......@@ -28,13 +28,26 @@ struct argument : raw_data<argument>
data = [=]() mutable { return buffer.data(); };
}
argument(shape s, std::function<char*()> d) : data(std::move(d)), m_shape(std::move(s)) {}
// Construct from a nullary callable that yields a pointer to the data;
// the pointer is converted to char* on every access.
template <class F, MIGRAPHX_REQUIRES(std::is_pointer<decltype(std::declval<F>()())>{})>
argument(shape s, F d)
: data([f = std::move(d)]() mutable { return reinterpret_cast<char*>(f()); }),
m_shape(std::move(s))
{
}
// Construct from a raw pointer. Non-owning: only the pointer is captured,
// so the pointed-to storage must outlive this argument.
template <class T>
argument(shape s, T* d)
: data([d] { return reinterpret_cast<char*>(d); }), m_shape(std::move(s))
{
}
// Construct from a shared_ptr. The data function copies the shared_ptr,
// so it shares ownership and keeps the buffer alive while in use.
template <class T>
argument(shape s, std::shared_ptr<T> d)
: data([d] { return reinterpret_cast<char*>(d.get()); }), m_shape(std::move(s))
{
}
// Construct an argument that has a shape but no data: data() yields nullptr.
argument(shape s, std::nullptr_t) : data([] { return nullptr; }), m_shape(std::move(s)) {}
/// Provides a raw pointer to the data
std::function<char*()> data = nullptr;
......@@ -49,6 +62,13 @@ struct argument : raw_data<argument>
return {s, [=]() mutable { return self.data(); }};
}
/// Make a copy of the argument that always shares the underlying data.
/// The returned argument holds a heap-allocated shared copy of *this and
/// forwards every data() call to it.
argument share() const
{
return {m_shape,
[held = std::make_shared<argument>(*this)]() mutable { return held->data(); }};
}
private:
shape m_shape;
};
......
......@@ -2,6 +2,8 @@
#define MIGRAPHX_GUARD_RTGLIB_CHECK_SHAPES_HPP
#include <migraphx/shape.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/config.hpp>
#include <algorithm>
......@@ -48,11 +50,12 @@ struct check_shapes
return end - begin;
}
const check_shapes& has(std::size_t n) const
template <class... Ts>
const check_shapes& has(Ts... ns) const
{
if(size() != n)
MIGRAPHX_THROW(prefix() + "Wrong number of arguments: expected " + std::to_string(n) +
" but given " + std::to_string(size()));
if(migraphx::none_of({ns...}, [&](auto i) { return this->size() == i; }))
MIGRAPHX_THROW(prefix() + "Wrong number of arguments: expected " +
to_string_range({ns...}) + " but given " + std::to_string(size()));
return *this;
}
......
#ifndef MIGRAPHX_GUARD_RTGLIB_COMPILE_OPTIONS_HPP
#define MIGRAPHX_GUARD_RTGLIB_COMPILE_OPTIONS_HPP
#include <migraphx/config.hpp>
#include <migraphx/tracer.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct compile_options
{
// Enables implicit offload copying during compilation (data is copied
// to/from the offload target automatically); disabled by default.
bool offload_copy = false;
// Tracer invoked on the program during compilation passes.
tracer trace{};
};
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -78,20 +78,18 @@ struct xorshift_generator
};
template <class T>
std::vector<T> generate_tensor_data(const migraphx::shape& s, unsigned long seed = 0)
auto generate_tensor_data(const migraphx::shape& s, unsigned long seed = 0)
{
std::vector<T> result(s.elements());
std::generate(result.begin(), result.end(), xorshf96_generator<T>{seed});
// std::generate(result.begin(), result.end(), [&]{ return seed % 7; });
// std::generate(result.begin(), result.end(), []{ return 1; });
auto result = make_shared_array<T>(s.elements());
std::generate(result.get(), result.get() + s.elements(), xorshf96_generator<T>{seed});
return result;
}
template <class T>
std::vector<T> fill_tensor_data(const migraphx::shape& s, unsigned long value = 0)
auto fill_tensor_data(const migraphx::shape& s, unsigned long value = 0)
{
std::vector<T> result(s.elements());
std::generate(result.begin(), result.end(), [=] { return value; });
auto result = make_shared_array<T>(s.elements());
std::generate(result.get(), result.get() + s.elements(), [=] { return value; });
return result;
}
......
......@@ -8,7 +8,7 @@
#include <migraphx/builtin.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/target.hpp>
#include <migraphx/tracer.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/env.hpp>
#include <migraphx/config.hpp>
#include <algorithm>
......@@ -107,7 +107,7 @@ struct program
instruction_ref validate() const;
void compile(const target& t, tracer trace = tracer{});
void compile(const target& t, compile_options options = compile_options{});
void finalize();
......
......@@ -83,18 +83,30 @@ inline std::string remove_prefix(std::string s, const std::string& prefix)
return s;
}
template <class Range>
inline std::string to_string_range(const Range& r)
template <class Iterator>
inline std::string to_string_range(Iterator start, Iterator last)
{
std::stringstream ss;
if(!r.empty())
if(start != last)
{
ss << r.front();
std::for_each(std::next(r.begin()), r.end(), [&](auto&& x) { ss << ", " << x; });
ss << *start;
std::for_each(std::next(start), last, [&](auto&& x) { ss << ", " << x; });
}
return ss.str();
}
// Joins the elements of any range with ", " by delegating to the
// iterator-pair overload.
template <class Range>
inline std::string to_string_range(const Range& r)
{
return to_string_range(r.begin(), r.end());
}
// Overload for braced-init-lists, e.g. to_string_range({1, 2, 3});
// a braced list cannot deduce the generic Range template parameter.
template <class T>
inline std::string to_string_range(const std::initializer_list<T>& r)
{
return to_string_range(r.begin(), r.end());
}
template <class T>
inline std::string to_string(const T& x)
{
......
......@@ -11,6 +11,7 @@
#include <migraphx/context.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/config.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/rank.hpp>
......@@ -28,9 +29,10 @@ struct target
* @brief The transformation pass to be run during compilation.
*
* @param ctx This is the target-dependent context that is created by `get_context`
* @param options Compiling options passed in by the user
* @return The passes to be ran
*/
std::vector<pass> get_passes(context& ctx) const;
std::vector<pass> get_passes(context& ctx, const compile_options& options) const;
/**
* @brief Construct a context for the target.
* @return The context to be used during compilation and execution.
......@@ -122,7 +124,7 @@ argument copy_from_target(T& x, const argument& arg)
* struct target
* {
* std::string name() const;
* std::vector<pass> get_passes(context& ctx) const;
* std::vector<pass> get_passes(context& ctx,const compile_options& options) const;
* context get_context() const;
* argument copy_to(const argument& input) const;
* argument copy_from(const argument& input) const;
......@@ -194,10 +196,10 @@ struct target
return (*this).private_detail_te_get_handle().name();
}
std::vector<pass> get_passes(context& ctx) const
std::vector<pass> get_passes(context& ctx, const compile_options& options) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().get_passes(ctx);
return (*this).private_detail_te_get_handle().get_passes(ctx, options);
}
context get_context() const
......@@ -237,12 +239,13 @@ struct target
virtual std::shared_ptr<private_detail_te_handle_base_type> clone() const = 0;
virtual const std::type_info& type() const = 0;
virtual std::string name() const = 0;
virtual std::vector<pass> get_passes(context& ctx) const = 0;
virtual context get_context() const = 0;
virtual argument copy_to(const argument& input) const = 0;
virtual argument copy_from(const argument& input) const = 0;
virtual argument allocate(const shape& s) const = 0;
virtual std::string name() const = 0;
virtual std::vector<pass> get_passes(context& ctx,
const compile_options& options) const = 0;
virtual context get_context() const = 0;
virtual argument copy_to(const argument& input) const = 0;
virtual argument copy_from(const argument& input) const = 0;
virtual argument allocate(const shape& s) const = 0;
};
template <typename PrivateDetailTypeErasedT>
......@@ -275,10 +278,10 @@ struct target
std::string name() const override { return private_detail_te_value.name(); }
std::vector<pass> get_passes(context& ctx) const override
std::vector<pass> get_passes(context& ctx, const compile_options& options) const override
{
return private_detail_te_value.get_passes(ctx);
return private_detail_te_value.get_passes(ctx, options);
}
context get_context() const override { return private_detail_te_value.get_context(); }
......
......@@ -345,15 +345,15 @@ instruction_ref program::validate() const
[&](const instruction& i) { return !i.valid(impl->instructions.begin()); });
}
void program::compile(const target& t, tracer trace)
void program::compile(const target& t, compile_options options)
{
assert(this->validate() == impl->instructions.end());
this->impl->ctx = t.get_context();
if(enabled(MIGRAPHX_TRACE_COMPILE{}))
trace = tracer{std::cout};
trace(*this);
trace();
run_passes(*this, t.get_passes(this->impl->ctx), trace);
options.trace = tracer{std::cout};
options.trace(*this);
options.trace();
run_passes(*this, t.get_passes(this->impl->ctx, options), options.trace);
auto invalid = this->validate();
if(invalid != impl->instructions.end())
{
......
......@@ -159,7 +159,14 @@ PYBIND11_MODULE(migraphx, m)
.def("clone", [](migraphx::program& p) { return *(new migraphx::program(p)); })
.def("get_parameter_shapes", &migraphx::program::get_parameter_shapes)
.def("get_shape", &migraphx::program::get_shape)
.def("compile", [](migraphx::program& p, const migraphx::target& t) { p.compile(t); })
.def("compile",
[](migraphx::program& p, const migraphx::target& t, bool offload_copy) {
migraphx::compile_options options;
options.offload_copy = offload_copy;
p.compile(t, options);
},
py::arg("t"),
py::arg("offload_copy") = true)
.def("run", &migraphx::program::eval)
.def("__eq__", std::equal_to<migraphx::program>{})
.def("__ne__", std::not_equal_to<migraphx::program>{})
......@@ -199,7 +206,6 @@ PYBIND11_MODULE(migraphx, m)
m.def("to_gpu", &migraphx::gpu::to_gpu, py::arg("arg"), py::arg("host") = false);
m.def("from_gpu", &migraphx::gpu::from_gpu);
m.def("gpu_sync", &migraphx::gpu::gpu_sync);
m.def("copy_to_gpu", &migraphx::gpu::copy_to_gpu);
#endif
#ifdef VERSION_INFO
......
......@@ -2,6 +2,7 @@
#define MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
#include <migraphx/program.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/config.hpp>
......@@ -13,7 +14,7 @@ namespace cpu {
struct target
{
std::string name() const;
std::vector<pass> get_passes(migraphx::context& ctx) const;
std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;
migraphx::context get_context() const { return context{}; }
argument copy_to(const argument& arg) const { return arg; }
......
......@@ -13,7 +13,7 @@ namespace cpu {
std::string target::name() const { return "cpu"; }
std::vector<pass> target::get_passes(migraphx::context&) const
std::vector<pass> target::get_passes(migraphx::context&, const compile_options&) const
{
return {rewrite_rnn{},
dead_code_elimination{},
......
......@@ -101,6 +101,7 @@ add_library(migraphx_gpu
int8_gemm_pack.cpp
int8_conv_pack.cpp
gemm_impl.cpp
preallocate_param.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_set_soversion(migraphx_gpu ${PROJECT_VERSION})
......
......@@ -5,7 +5,6 @@
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -5,7 +5,6 @@
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -4,7 +4,6 @@
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -4,7 +4,6 @@
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -69,8 +68,6 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
});
}
void sync_stream(hipStream_t stream) { (void)hipStreamSynchronize(stream); }
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -5,7 +5,6 @@
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -6,7 +6,6 @@
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment