Stream sync Changset (#1358)

Stream sync changes and associated API level changes

Stream sync Changset (#1358)
Stream sync changes and associated API level changes
f7d987ba · Ted Themistokleous · GitHub · a9a47402 · f7d987ba · f7d987ba
Unverified Commit f7d987ba authored Oct 04, 2022 by Ted Themistokleous Committed by GitHub Oct 04, 2022
17 changed files
--- a/src/api/api.cpp
+++ b/src/api/api.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include <migraphx/execution_environment.hpp>
 #include <migraphx/migraphx.h>
 #include <migraphx/rank.hpp>
 #include <migraphx/shape.hpp>
@@ -166,6 +167,13 @@ void set_output_names(tf_options& options, std::vector<const char*> names)
    options.output_node_names = std::vector<std::string>(names.begin(), names.end());
 }
+std::vector<argument>
+run_async(program& p, const parameter_map& params, void* s, std::string_view name)
+{
+    execution_environment exec_env{any_ptr(s, name), true};
+    return p.eval(params, exec_env);
+}
 template <class Value>
 std::vector<const char*> get_names(const std::unordered_map<std::string, Value>& m)
 {
@@ -1435,6 +1443,23 @@ extern "C" migraphx_status migraphx_program_run(migraphx_arguments_t* out,
    return api_error_result;
 }
+extern "C" migraphx_status migraphx_program_run_async(migraphx_arguments_t* out,
+                                                      migraphx_program_t program,
+                                                      migraphx_program_parameters_t params,
+                                                      void* s,
+                                                      const char* name)
+{
+    auto api_error_result = migraphx::try_([&] {
+        if(program == nullptr)
+            MIGRAPHX_THROW(migraphx_status_bad_param, "Bad parameter program: Null pointer");
+        if(params == nullptr)
+            MIGRAPHX_THROW(migraphx_status_bad_param, "Bad parameter params: Null pointer");
+        *out = allocate<migraphx_arguments_t>(
+            migraphx::run_async((program->object), (params->object), (s), (name)));
+    });
+    return api_error_result;
+}
 extern "C" migraphx_status
 migraphx_program_equal(bool* out, const_migraphx_program_t program, const_migraphx_program_t x)
 {

--- a/src/api/include/migraphx/migraphx.h
+++ b/src/api/include/migraphx/migraphx.h
@@ -357,6 +357,12 @@ migraphx_status migraphx_program_run(migraphx_arguments_t* out,
                                     migraphx_program_t program,
                                     migraphx_program_parameters_t params);
+migraphx_status migraphx_program_run_async(migraphx_arguments_t* out,
+                                           migraphx_program_t program,
+                                           migraphx_program_parameters_t params,
+                                           void* s,
+                                           const char* name);
 migraphx_status
 migraphx_program_equal(bool* out, const_migraphx_program_t program, const_migraphx_program_t x);

--- a/src/api/include/migraphx/migraphx.hpp
+++ b/src/api/include/migraphx/migraphx.hpp
@@ -1065,6 +1065,20 @@ struct program : MIGRAPHX_HANDLE_BASE(program)
        return arguments(pout, own{});
    }
+    template <class Stream>
+    /// Overloaded to allow for execution_environment input
+    arguments run_async(const program_parameters& pparams, Stream* s) const
+    {
+        migraphx_arguments_t pout;
+        call(&migraphx_program_run_async,
+             &pout,
+             this->get_handle_ptr(),
+             pparams.get_handle_ptr(),
+             s,
+             get_type_name<Stream>().c_str());
+        return arguments(pout, own{});
+    }
    void print() const { call(&migraphx_program_print, this->get_handle_ptr()); }
    program sort()

--- a/src/api/migraphx.py
+++ b/src/api/migraphx.py
@@ -276,6 +276,13 @@ def program(h):
                 params='std::unordered_map<std::string, migraphx::argument>'),
             invoke='migraphx::run($@)',
             returns='std::vector<migraphx::argument>')
+    h.method('run_async',
+             api.params(
+                 params='std::unordered_map<std::string, migraphx::argument>',
+                 s='void*',
+                 name='const char *'),
+             invoke='migraphx::run_async($@)',
+             returns='std::vector<migraphx::argument>')
    h.method('equal',
             api.params(x='const migraphx::program&'),
             invoke='migraphx::equal($@)',

--- a/src/include/migraphx/context.hpp
+++ b/src/include/migraphx/context.hpp
@@ -66,6 +66,15 @@ any_ptr get_queue_context(T&)
 {
    return {};
 }
+template <class T>
+void wait_for_context(T&, any_ptr)
+{
+}
+template <class T>
+void finish_on_context(T&, any_ptr)
+{
+}
 #ifdef TYPE_ERASED_DECLARATION
@@ -78,6 +87,10 @@ struct context
    void from_value(const value& v);
    // (optional)
    any_ptr get_queue();
+    // (optional)
+    void wait_for(any_ptr queue);
+    // (optional)
+    void finish_on(any_ptr queue);
    //
    void finish() const;
 };
@@ -165,6 +178,18 @@ struct context
        return (*this).private_detail_te_get_handle().get_queue();
    }
+    void wait_for(any_ptr queue)
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        (*this).private_detail_te_get_handle().wait_for(queue);
+    }
+    void finish_on(any_ptr queue)
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        (*this).private_detail_te_get_handle().finish_on(queue);
+    }
    void finish() const
    {
        assert((*this).private_detail_te_handle_mem_var);
@@ -187,6 +212,8 @@ struct context
        virtual value to_value() const          = 0;
        virtual void from_value(const value& v) = 0;
        virtual any_ptr get_queue()             = 0;
+        virtual void wait_for(any_ptr queue)    = 0;
+        virtual void finish_on(any_ptr queue)   = 0;
        virtual void finish() const             = 0;
    };
@@ -231,6 +258,33 @@ struct context
        return get_queue_context(private_detail_te_self);
    }
+    template <class T>
+    static auto private_detail_te_default_wait_for(char, T&& private_detail_te_self, any_ptr queue)
+        -> decltype(private_detail_te_self.wait_for(queue))
+    {
+        private_detail_te_self.wait_for(queue);
+    }
+    template <class T>
+    static void private_detail_te_default_wait_for(float, T&& private_detail_te_self, any_ptr queue)
+    {
+        wait_for_context(private_detail_te_self, queue);
+    }
+    template <class T>
+    static auto private_detail_te_default_finish_on(char, T&& private_detail_te_self, any_ptr queue)
+        -> decltype(private_detail_te_self.finish_on(queue))
+    {
+        private_detail_te_self.finish_on(queue);
+    }
+    template <class T>
+    static void
+    private_detail_te_default_finish_on(float, T&& private_detail_te_self, any_ptr queue)
+    {
+        finish_on_context(private_detail_te_self, queue);
+    }
    template <typename PrivateDetailTypeErasedT>
    struct private_detail_te_handle_type : private_detail_te_handle_base_type
    {
@@ -248,7 +302,7 @@ struct context
            PrivateDetailTypeErasedT value,
            typename std::enable_if<not std::is_reference<PrivateDetailTypeErasedU>::value,
                                    int>::type* = nullptr) noexcept
-            : private_detail_te_value(std::move(value))
+            : private_detail_te_value(value)
        {
        }
@@ -277,6 +331,18 @@ struct context
            return private_detail_te_default_get_queue(char(0), private_detail_te_value);
        }
+        void wait_for(any_ptr queue) override
+        {
+            private_detail_te_default_wait_for(char(0), private_detail_te_value, queue);
+        }
+        void finish_on(any_ptr queue) override
+        {
+            private_detail_te_default_finish_on(char(0), private_detail_te_value, queue);
+        }
        void finish() const override { private_detail_te_value.finish(); }
        PrivateDetailTypeErasedT private_detail_te_value;

--- a/src/include/migraphx/execution_environment.hpp
+++ b/src/include/migraphx/execution_environment.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_EXECUTION_ENV_HPP
+#define MIGRAPHX_GUARD_MIGRAPHLIB_EXECUTION_ENV_HPP
+#include <migraphx/any_ptr.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct execution_environment
+{
+    any_ptr queue = any_ptr{};
+    bool async    = false;
+};
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif /* MIGRAPHX_GUARD_MIGRAPHLIB_EXECUTION_ENV_HPP */
--- a/src/include/migraphx/program.hpp
+++ b/src/include/migraphx/program.hpp
@@ -37,6 +37,7 @@
 #include <migraphx/assignment_options.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/execution_environment.hpp>
 #include <algorithm>
 #include <iostream>
@@ -76,8 +77,8 @@ struct program
    std::unordered_map<std::string, shape> get_parameter_shapes() const;
-    std::vector<argument> eval(parameter_map params) const;
+    std::vector<argument> eval(parameter_map params,
+                               execution_environment exec_env = execution_environment{}) const;
    std::size_t size() const;
    std::vector<shape> get_output_shapes() const;

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -398,7 +398,7 @@ std::vector<argument> generic_eval(const program& p,
    return generic_eval(mm, ctx, params, {}, make_trace);
 }
-std::vector<argument> program::eval(parameter_map params) const
+std::vector<argument> program::eval(parameter_map params, execution_environment exec_env) const
 {
    auto& ctx = this->impl->ctx;
 #ifndef NDEBUG
@@ -423,6 +423,12 @@ std::vector<argument> program::eval(parameter_map params) const
 #endif
    auto trace_level = value_of(MIGRAPHX_TRACE_EVAL{});
+    std::vector<argument> ret;
+    if(exec_env.async)
+    {
+        ctx.wait_for(exec_env.queue);
+    }
    if(trace_level > 0)
    {
@@ -434,49 +440,56 @@ std::vector<argument> program::eval(parameter_map params) const
            ins_out[x] = ss.str();
        });
-        return generic_eval(*this,
+        ret = generic_eval(*this,
-                            ctx,
+                           ctx,
-                            std::move(params),
+                           std::move(params),
-                            with_check_context([&](auto& ins, auto f, auto&& check_context) {
+                           with_check_context([&](auto& ins, auto f, auto&& check_context) {
-                                ctx.finish();
+                               ctx.finish();
-                                std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
+                               std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
-                                timer t{};
+                               timer t{};
-                                auto result = check_context(f);
+                               auto result = check_context(f);
-                                double t1   = t.record<milliseconds>();
+                               double t1   = t.record<milliseconds>();
-                                ctx.finish();
+                               ctx.finish();
-                                double t2 = t.record<milliseconds>();
+                               double t2 = t.record<milliseconds>();
-                                std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
+                               std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
-                                if(trace_level > 1 and ins->name().front() != '@' and
+                               if(trace_level > 1 and ins->name().front() != '@' and
-                                   ins->name() != "load" and not result.empty())
+                                  ins->name() != "load" and not result.empty())
-                                {
+                               {
-                                    target tgt  = make_target(this->impl->target_name);
+                                   target tgt  = make_target(this->impl->target_name);
-                                    auto buffer = tgt.copy_from(result);
+                                   auto buffer = tgt.copy_from(result);
-                                    if(trace_level == 2)
+                                   if(trace_level == 2)
-                                    {
+                                   {
-                                        std::cout << "Output has "
+                                       std::cout << "Output has "
-                                                  << to_string_range(classify_argument(buffer))
+                                                 << to_string_range(classify_argument(buffer))
-                                                  << std::endl;
+                                                 << std::endl;
-                                        std::cout << "Output: ";
+                                       std::cout << "Output: ";
-                                        preview_argument(std::cout, buffer);
+                                       preview_argument(std::cout, buffer);
-                                        std::cout << std::endl;
+                                       std::cout << std::endl;
-                                    }
+                                   }
-                                    else
+                                   else
-                                    {
+                                   {
-                                        std::cout << "Output: " << buffer << std::endl;
+                                       std::cout << "Output: " << buffer << std::endl;
-                                    }
+                                   }
-                                }
+                               }
-                                return result;
+                               return result;
-                            }));
+                           }));
    }
    else
    {
-        return generic_eval(*this,
+        ret = generic_eval(*this,
-                            ctx,
+                           ctx,
-                            std::move(params),
+                           std::move(params),
-                            with_check_context([&](auto&, auto f, auto&& check_context) {
+                           with_check_context([&](auto&, auto f, auto&& check_context) {
-                                return check_context(f);
+                               return check_context(f);
-                            }));
+                           }));
    }
+    if(exec_env.async)
+    {
+        ctx.finish_on(exec_env.queue);
+    }
+    return ret;
 }
 const int program_file_version = 5;

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -355,6 +355,23 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
                 }
                 return p.eval(pm);
             })
+        .def("run_async",
+             [](migraphx::program& p,
+                py::dict params,
+                std::uintptr_t stream,
+                std::string stream_name) {
+                 migraphx::parameter_map pm;
+                 for(auto x : params)
+                 {
+                     std::string key      = x.first.cast<std::string>();
+                     py::buffer b         = x.second.cast<py::buffer>();
+                     py::buffer_info info = b.request();
+                     pm[key]              = migraphx::argument(to_shape(info), info.ptr);
+                 }
+                 migraphx::execution_environment exec_env{
+                     migraphx::any_ptr(reinterpret_cast<void*>(stream), stream_name), true};
+                 return p.eval(pm, exec_env);
+             })
        .def("sort", &migraphx::program::sort)
        .def("print", [](const migraphx::program& p) { std::cout << p << std::endl; })
        .def("__eq__", std::equal_to<migraphx::program>{})

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -197,7 +197,9 @@ struct hip_device
 struct context
 {
    context(std::size_t device_id = 0, std::size_t n = value_of(MIGRAPHX_NSTREAMS{}, 1))
-        : current_device(std::make_shared<hip_device>(device_id, n))
+        : current_device(std::make_shared<hip_device>(device_id, n)),
+          begin_event(create_event()),
+          finish_event(create_event())
    {
    }
@@ -274,6 +276,24 @@ struct context
        this->current_device = std::make_shared<hip_device>(0, n_streams);
    }
+    void wait_for(any_ptr queue)
+    {
+        auto status = hipEventRecord(begin_event.get(), queue.get<hipStream_t>());
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("failed to record " + hip_error(status));
+        get_stream().wait(begin_event.get());
+    }
+    void finish_on(any_ptr queue)
+    {
+        get_stream().record(finish_event.get());
+        auto status = hipStreamWaitEvent(queue.get<hipStream_t>(), finish_event.get(), 0);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to wait on event " + hip_error(status));
+    }
    any_ptr get_queue() { return get_stream().get(); }
    void enable_perf_measurement(bool b = true)
@@ -316,9 +336,13 @@ struct context
    // TODO: Make this a vector to support multiple devices
    std::shared_ptr<hip_device> current_device;
    std::vector<shared<hip_event_ptr>> events;
-    bool measure_perf                 = false;
+    bool measure_perf = false;
+    // for event perf timing
    shared<hip_event_ptr> start_event = nullptr;
    shared<hip_event_ptr> stop_event  = nullptr;
+    // for stream syncronization
+    shared<hip_event_ptr> begin_event  = nullptr;
+    shared<hip_event_ptr> finish_event = nullptr;
 };
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }

--- a/test/api/test_gpu.cpp
+++ b/test/api/test_gpu.cpp
@@ -25,6 +25,8 @@
 #include <hip/hip_runtime_api.h>
 #include <migraphx/migraphx.h>
 #include <migraphx/migraphx.hpp>
+#include <migraphx/manage_ptr.hpp>
 #include "test.hpp"
 TEST_CASE(load_and_run)
@@ -44,11 +46,67 @@ TEST_CASE(load_and_run)
    {
        pp.add(name, migraphx::argument::generate(param_shapes[name]));
    }
    auto outputs = p.eval(pp);
    CHECK(shapes_before.size() == outputs.size());
    CHECK(bool{shapes_before.front() == outputs.front().get_shape()});
 }
+using hip_ptr    = MIGRAPHX_MANAGE_PTR(void, hipFree);
+using stream_ptr = MIGRAPHX_MANAGE_PTR(hipStream_t, hipStreamDestroy);
+stream_ptr get_stream()
+{
+    hipStream_t stream;
+    auto err = hipStreamCreateWithFlags(&stream, 0);
+    EXPECT(err == hipSuccess);
+    return stream_ptr{stream};
+}
+hip_ptr get_hip_buffer(size_t size)
+{
+    void* ptr;
+    auto err = hipMalloc(&ptr, size);
+    EXPECT(err == hipSuccess);
+    return hip_ptr{ptr};
+}
+TEST_CASE(load_and_run_async)
+{
+    auto p             = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
+    auto shapes_before = p.get_output_shapes();
+    migraphx::compile_options options;
+    options.set_offload_copy(false);
+    p.compile(migraphx::target("gpu"), options);
+    auto shapes_after = p.get_output_shapes();
+    CHECK(shapes_before.size() == 1);
+    CHECK(shapes_before.size() == shapes_after.size());
+    CHECK(bool{shapes_before.front() == shapes_after.front()});
+    migraphx::program_parameters pp;
+    auto param_shapes = p.get_parameter_shapes();
+    stream_ptr stream = get_stream();
+    std::vector<hip_ptr> buffs;
+    std::vector<migraphx::argument> args;
+    for(auto&& name : param_shapes.names())
+    {
+        args.push_back(migraphx::argument::generate(param_shapes[name]));
+        buffs.push_back(get_hip_buffer(args.rbegin()->get_shape().bytes()));
+        auto err = hipMemcpy(buffs.rbegin()->get(),
+                             args.rbegin()->data(),
+                             args.rbegin()->get_shape().bytes(),
+                             hipMemcpyHostToDevice);
+        EXPECT(err == hipSuccess);
+        pp.add(name, migraphx::argument(args.rbegin()->get_shape(), buffs.rbegin()->get()));
+    }
+    auto outputs = p.run_async(pp, stream.get());
+    CHECK(shapes_before.size() == outputs.size());
+    CHECK(bool{shapes_before.front() == outputs.front().get_shape()});
+}
 TEST_CASE(load_and_run_ctx)
 {
    auto p = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");

--- a/test/gpu/stream_sync.cpp
+++ b/test/gpu/stream_sync.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <iostream>
+#include <vector>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/context.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/kernel.hpp>
+#include <migraphx/gpu/device_name.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/gpu/target.hpp>
+#include "test.hpp"
+using hip_stream_ptr = MIGRAPHX_MANAGE_PTR(hipStream_t, hipStreamDestroy);
+constexpr uint32_t stream_sync_test_val = 1337;
+// NOLINTNEXTLINE
+const std::string compare_numbers = R"__migraphx__(
+#include <hip/hip_runtime.h>
+extern "C" {
+__global__ void compare(float* data) 
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+    if (data[i] != 1337) 
+    {
+        abort();
+    }
+}
+}
+int main() {}
+)__migraphx__";
+migraphx::src_file make_src_file(const std::string& name, const std::string& content)
+{
+    return {name, std::make_pair(content.data(), content.data() + content.size())};
+}
+hip_stream_ptr get_stream()
+{
+    hipStream_t stream;
+    auto status = hipStreamCreate(&stream);
+    if(status != hipSuccess)
+    {
+        MIGRAPHX_THROW("Failed to get stream");
+    }
+    return hip_stream_ptr{stream};
+}
+TEST_CASE(test_stream_sync_compare_kernel)
+{
+    auto binaries = migraphx::gpu::compile_hip_src(
+        {make_src_file("check_stuff.cpp", compare_numbers)}, "", migraphx::gpu::get_device_name());
+    EXPECT(binaries.size() == 1);
+    migraphx::gpu::kernel k1{binaries.front(), "compare"};
+    auto input =
+        migraphx::fill_argument({migraphx::shape::float_type, {128}}, stream_sync_test_val);
+    auto ginput = migraphx::gpu::to_gpu(input);
+    hip_stream_ptr pstream = get_stream();
+    k1.launch(pstream.get(), input.get_shape().elements(), 1024)(ginput.cast<float>());
+    auto output = migraphx::gpu::from_gpu(ginput);
+    EXPECT(output == input);
+}
+TEST_CASE(test_stream_sync)
+{
+    auto binaries = migraphx::gpu::compile_hip_src(
+        {make_src_file("check_stuff.cpp", compare_numbers)}, "", migraphx::gpu::get_device_name());
+    EXPECT(binaries.size() == 1);
+    migraphx::gpu::kernel k1{binaries.front(), "compare"};
+    const unsigned int m = 128;
+    const unsigned int k = 8192;
+    // Setup empty GPU memory buffer
+    migraphx::shape input_shape{migraphx::shape::float_type, {m, k}};
+    migraphx::shape output_shape{migraphx::shape::float_type, {m, m}};
+    auto input  = migraphx::fill_argument(input_shape, 0);
+    auto ginput = migraphx::gpu::to_gpu(input);
+    auto output  = migraphx::fill_argument(output_shape, 0);
+    auto goutput = migraphx::gpu::to_gpu(output);
+    hip_stream_ptr pstream = get_stream();
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {m, k}});
+    auto y = mm->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {k, m}}));
+    std::vector<float> data(m * m, stream_sync_test_val);
+    auto test_val = mm->add_literal(output_shape, data);
+    auto mult_out = mm->add_instruction(migraphx::make_op("dot"), x, y);
+    mm->add_instruction(migraphx::make_op("add"), mult_out, test_val);
+    p.compile(migraphx::gpu::target{});
+    // Run network and then verify with kernel
+    auto args = p.eval({{"x", ginput}, {"output", goutput}}, {pstream.get(), true});
+    k1.launch(pstream.get(), m * m, 1024)(goutput.cast<float>());
+    output = migraphx::gpu::from_gpu(goutput);
+    EXPECT(output != input);
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/py/CMakeLists.txt
+++ b/test/py/CMakeLists.txt
@@ -56,4 +56,5 @@ add_py_test(gpu_offload test_gpu_offload.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(gpu test_gpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(array test_array.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(backend onnx_backend_test.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
+add_py_test(gpu_async test_gpu_async.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 endif()
--- a/test/py/test_gpu_async.py
+++ b/test/py/test_gpu_async.py
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+import migraphx
+import ctypes
+def test_conv_relu():
+    hip = ctypes.cdll.LoadLibrary("libamdhip64.so")
+    p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx")
+    print(p)
+    print("Compiling ...")
+    # Need to have offload_copy = False to avoid syncs() back to the host device
+    p.compile(migraphx.get_target("gpu"), offload_copy=False)
+    print(p)
+    params = {}
+    # Using default value in api for hipSuccess which is always 0
+    hipSuccess = ctypes.c_long(0)
+    # Alloc a stream
+    stream = ctypes.c_void_p()
+    err = ctypes.c_long(
+        hip.hipStreamCreateWithFlags(ctypes.byref(stream), ctypes.c_uint(0)))
+    if err.value != hipSuccess.value:
+        print("FAILED hipStreamCreate")
+        return err
+    # Use to_gpu to push generated argument to the GPU before we perform a run
+    for key, value in p.get_parameter_shapes().items():
+        params[key] = migraphx.to_gpu(migraphx.generate_argument(value))
+    result = migraphx.from_gpu(
+        p.run_async(params, stream.value, "ihipStream_t")[-1])
+    # Wait for all commands in stream to complete
+    err = ctypes.c_long(hip.hipStreamSynchronize(stream))
+    if err.value != hipSuccess.value:
+        print("FAILED: hipStreamSyncronize")
+        return err
+    # Cleanup Stream
+    err = ctypes.c_long(hip.hipStreamDestroy(stream))
+    if err.value != hipSuccess.value:
+        print("FAILED: hipStreamDestroy")
+        return err
+    print(result)
+test_conv_relu()
--- a/tools/accuracy/accuracy_checker.py
+++ b/tools/accuracy/accuracy_checker.py
@@ -116,6 +116,9 @@ def main():
    model = migraphx.parse_onnx(model_name, default_dim_value=batch)
+    if args.verbose:
+        print(model)
    model.compile(migraphx.get_target('gpu'), offload_copy=False)
    params = {}

--- a/tools/api/api.cpp
+++ b/tools/api/api.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include <migraphx/execution_environment.hpp>
 #include <migraphx/migraphx.h>
 #include <migraphx/rank.hpp>
 #include <migraphx/shape.hpp>
@@ -166,6 +167,13 @@ void set_output_names(tf_options& options, std::vector<const char*> names)
    options.output_node_names = std::vector<std::string>(names.begin(), names.end());
 }
+std::vector<argument>
+run_async(program& p, const parameter_map& params, void* s, std::string_view name)
+{
+    execution_environment exec_env{any_ptr(s, name), true};
+    return p.eval(params, exec_env);
+}
 template <class Value>
 std::vector<const char*> get_names(const std::unordered_map<std::string, Value>& m)
 {

--- a/tools/include/context.hpp
+++ b/tools/include/context.hpp
@@ -66,12 +66,21 @@ any_ptr get_queue_context(T&)
 {
    return {};
 }
+template <class T>
+void wait_for_context(T&, any_ptr)
+{
+}
+template <class T>
+void finish_on_context(T&, any_ptr){}
 <%
 interface('context',
           virtual('to_value', returns = 'value', const = True, default = 'to_value_context'),
           virtual('from_value', v = 'const value&', default = 'from_value_context'),
           virtual('get_queue', returns = 'any_ptr', default = 'get_queue_context'),
+           virtual('wait_for', queue = 'any_ptr', returns = 'void', default = 'wait_for_context'),
+           virtual('finish_on', queue = 'any_ptr', returns = 'void', default = 'finish_on_context'),
           virtual('finish', returns = 'void', const = True)) %>
    inline void migraphx_to_value(value& v, const context& ctx)