gaoqiong / MIGraphX · Commits · 417d6644

Commit 417d6644, authored May 20, 2022 by charlie
Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_conv
Parents: 79e27dac, 4a312201

Changes: 76. This page shows 16 changed files with 332 additions and 97 deletions (+332 −97).
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp   +45 −19
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp     +8 −11
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp      +45  −1
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp          +10  −1
src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp    +31 −15
src/targets/gpu/prefuse_ops.cpp                                   +76  −0
src/targets/gpu/schedule_model.cpp                                 +8  −8
src/targets/gpu/sync_device.cpp                                    +4  −4
src/targets/gpu/target.cpp                                         +3  −0
src/targets/gpu/write_literals.cpp                                 +7  −7
test/api/test_module_construct.cpp                                 +5  −7
test/dead_code_elimination_test.cpp                               +34  −0
test/matcher.cpp                                                  +12 −12
test/py/test_module_construct.py                                  +12 −10
test/py/test_numpy.py                                             +22  −0
tools/install_prereqs.sh                                          +10  −2
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp    View file @ 417d6644

@@ -3,6 +3,14 @@
 #include <migraphx/kernels/array.hpp>
 
+// NOLINTNEXTLINE
+#define MIGRAPHX_RETURNS(...) \
+    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_LIFT(...) \
+    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+
 namespace migraphx {
 
 struct swallow
@@ -161,6 +169,18 @@ constexpr auto pack(Ts... xs)
     return [=](auto f) { return f(xs...); };
 }
 
+template <class G, class F>
+constexpr auto join(G g, F f)
+{
+    return f([=](auto... xs) { return g(xs...); });
+}
+
+template <class G, class F, class... Fs>
+constexpr auto join(G g, F f, Fs... fs)
+{
+    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
+}
+
 template <class Compare, class P1, class P2>
 constexpr auto pack_compare(Compare compare, P1 p1, P2 p2)
 {
@@ -191,39 +211,45 @@ constexpr auto arg(IntegralConstant ic)
     return arg_c<ic>();
 }
 
-inline constexpr auto rotate_last()
+template <class F>
+constexpr auto make_transform(F f)
 {
-    return [](auto... xs) {
-        return [=](auto&& f) {
-            return sequence_c<sizeof...(xs)>([&](auto... is) {
-                constexpr auto size = sizeof...(is);
-                return f(arg_c<(is + size - 1) % size>()(xs...)...);
-            });
-        };
-    };
+    return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
 }
 
+// An arg transformation takes the arguments and then a function to take the new arguments:
+// transform(xs...)([](auto... ys) { ... })
+// The transform_args function takes a list of transformations and continually applies them
 template <class F>
 constexpr auto transform_args(F f)
 {
-    return [=](auto... xs) {
-        return [=](auto g) { return f(xs...)([&](auto... ys) { return g(ys...); }); };
-    };
+    return f;
 }
 
 template <class F, class... Fs>
 constexpr auto transform_args(F f, Fs... fs)
 {
-    return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
+    return make_transform([=](auto g, auto... xs) {
+        return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
+    });
 }
 
-// NOLINTNEXTLINE
-#define MIGRAPHX_RETURNS(...) \
-    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
+// identity transform
+inline constexpr auto transform_args()
+{
+    return make_transform([](auto f, auto... xs) { return f(xs...); });
+}
 
-// NOLINTNEXTLINE
-#define MIGRAPHX_LIFT(...) \
-    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+// Rotate the first argument to the last argument
+inline constexpr auto rotate_last()
+{
+    return make_transform([](auto f, auto... xs) {
+        return sequence_c<sizeof...(xs)>([&](auto... is) {
+            constexpr auto size = sizeof...(is);
+            return f(arg_c<(is + size - 1) % size>()(xs...)...);
+        });
+    });
+}
 
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
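Editor's note: the two macros and the variadic join added here are host-compilable C++, so their behaviour can be checked outside the GPU build. Below is a minimal standalone sketch (my own illustration, not part of the commit): MIGRAPHX_RETURNS, MIGRAPHX_LIFT and join are copied from the hunks above, pack is the two-line definition shown as context, and the add function plus the main driver are invented purely for the demo.

    #include <cassert>

    // Same shape as the macros added at the top of functional.hpp.
    #define MIGRAPHX_RETURNS(...) \
        ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
    #define MIGRAPHX_LIFT(...) \
        [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))

    int add(int a, int b) { return a + b; }

    // join(g, f, fs...) mirrors the overloads added in the diff: each f/fs is a
    // "pack" (a callable that feeds stored values into a continuation), and g
    // receives the concatenation of everything they hold.
    template <class G, class F>
    constexpr auto join(G g, F f)
    {
        return f([=](auto... xs) { return g(xs...); });
    }

    template <class G, class F, class... Fs>
    constexpr auto join(G g, F f, Fs... fs)
    {
        return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
    }

    // Copy of pack() as shown in the hunk context: captures values, forwards them to a callback.
    template <class... Ts>
    constexpr auto pack(Ts... xs)
    {
        return [=](auto f) { return f(xs...); };
    }

    int main()
    {
        // MIGRAPHX_LIFT wraps a named function (or overload set) into a single lambda object.
        auto lifted = MIGRAPHX_LIFT(add);
        assert(lifted(2, 3) == 5);

        // join flattens two packs into one argument list: g receives (1, 2, 3, 4).
        auto sum4 = join([](int a, int b, int c, int d) { return a + b + c + d; },
                         pack(1, 2),
                         pack(3, 4));
        assert(sum4 == 10);
        return 0;
    }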
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp    View file @ 417d6644

@@ -38,20 +38,17 @@ constexpr implicit_conversion_op<T> implicit_conversion(T x)
 template <class F, class T, class... Ts>
 __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
 {
-    preload<typename T::type>(idx, xs...)([&](auto... ps) {
-        idx.global_stride(out.get_shape().elements(),
-                          [&](auto i) { out[i] = implicit_conversion(f(ps[i]...)); });
-    });
+    idx.global_stride(out.get_shape().elements(),
+                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
 }
 
-template <class F, class... Ts>
-__device__ void pointwise(F f, Ts*... ps)
+template <class... Transforms>
+__device__ auto pointwise(index idx, Transforms... transforms)
 {
-    auto t   = transform_args(make_tensors(), rotate_last(), auto_vectorize());
-    auto idx = make_index();
-    t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
+    return [=](auto f, auto*... ps) {
+        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
+        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
+    };
 }
 
 } // namespace migraphx
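Editor's note: the new pointwise is a factory — it composes argument transforms with transform_args and returns a callable that takes the operation plus the raw pointers. The composition machinery itself is ordinary C++, so here is a host-only sketch of that shape. make_transform and transform_args are copied from the functional.hpp diff; add_one, times_two and apply_pointwise are invented stand-ins for make_tensors/rotate_last/auto_vectorize and for pointwise, used only to show how the pieces chain.

    #include <cassert>

    // Copies of the helpers introduced in functional.hpp (host-only sketch).
    template <class F>
    constexpr auto make_transform(F f)
    {
        return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
    }

    template <class F>
    constexpr auto transform_args(F f)
    {
        return f;
    }

    template <class F, class... Fs>
    constexpr auto transform_args(F f, Fs... fs)
    {
        return make_transform([=](auto g, auto... xs) {
            return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
        });
    }

    // Two toy argument transforms: one increments every argument, one doubles it.
    inline auto add_one()
    {
        return make_transform([](auto f, auto... xs) { return f((xs + 1)...); });
    }

    inline auto times_two()
    {
        return make_transform([](auto f, auto... xs) { return f((xs * 2)...); });
    }

    // Shaped like the new pointwise(): compose the transforms up front and return a
    // callable that takes the final function plus the raw arguments.
    template <class... Transforms>
    auto apply_pointwise(Transforms... transforms)
    {
        return [=](auto f, auto... xs) {
            auto t = transform_args(transforms...);
            return t(xs...)(f);
        };
    }

    int main()
    {
        auto op = apply_pointwise(add_one(), times_two());
        // add_one runs first, then times_two: (3+1)*2 + (4+1)*2 = 8 + 10
        int sum = op([](int a, int b) { return a + b; }, 3, 4);
        assert(sum == 18);
        return 0;
    }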
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp    View file @ 417d6644

@@ -3,6 +3,8 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/functional.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/vec.hpp>
 
 namespace migraphx {
 
@@ -73,7 +75,7 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
 {
     if constexpr(decltype(tensor_vec_size(x)){} == 0)
     {
-        auto v = vectorize(x);
+        auto v = auto_vectorize(x);
         auto b = as_vec(tensor_vec_size(v), buffer + offset);
         idx.local_stride(v.get_shape().element_space(),
                          [&](auto i) { b[i] = v.data()[i]; });
@@ -126,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
     };
 }
 
+inline __device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto out, auto... xs) {
+        preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
+    });
+}
+
+template <bool B, class T>
+__device__ auto preload_copy(index idx, T x)
+{
+    return [=](auto f) {
+        if constexpr(B)
+        {
+            using type          = typename T::type;
+            constexpr auto size = get_shape_c<T>{}.element_space();
+            __shared__ type buffer[size];
+            // TODO: Always vecotrize when size > 4, and then use a second loop for remainder
+            constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
+            auto input       = as_vec<n>(remove_bool(x.data()));
+            auto b           = as_vec<n>(remove_bool(buffer));
+            idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
+            return f(x.with(buffer));
+        }
+        else
+        {
+            return f(x);
+        }
+    };
+}
+
+template <bool... Bs>
+__device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto... xs) {
+        auto invoke = [=](auto... ys) {
+            __syncthreads();
+            f(ys...);
+        };
+        join(invoke, preload_copy<Bs>(idx, xs)...);
+    });
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
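Editor's note: the new preload_copy<B> keys the staging copy on a compile-time flag — when B is true the tensor's data is copied into a __shared__ buffer and the continuation sees the staged copy, otherwise the argument passes through untouched. __shared__ memory and __syncthreads() have no host equivalent, so the sketch below is only a CPU analogue of that if constexpr(B) split; the function name and the std::vector staging buffer are invented for illustration.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Host-side analogue of preload_copy<B>: when B is true the data is staged
    // into a separate buffer and the continuation sees the staged copy; when B is
    // false the continuation sees the original pointer untouched.
    template <bool B, class T>
    auto preload_copy_host(const T* x, std::size_t n, std::vector<T>& staging)
    {
        return [=, &staging](auto f) {
            if constexpr(B)
            {
                staging.assign(x, x + n); // stands in for the strided copy into __shared__ memory
                return f(staging.data());
            }
            else
            {
                return f(x);
            }
        };
    }

    int main()
    {
        std::vector<int> input = {1, 2, 3, 4};
        std::vector<int> staging;

        // B = true: the continuation receives the staged copy.
        int staged_sum = preload_copy_host<true>(input.data(), input.size(), staging)(
            [&](const int* p) { return p[0] + p[1] + p[2] + p[3]; });
        assert(staged_sum == 10);
        assert(staging.size() == 4); // data really was copied

        // B = false: pass-through, nothing is copied.
        staging.clear();
        int direct_sum = preload_copy_host<false>(input.data(), input.size(), staging)(
            [&](const int* p) { return p[0] + p[1] + p[2] + p[3]; });
        assert(direct_sum == 10);
        assert(staging.empty());
        return 0;
    }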
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp    View file @ 417d6644

@@ -60,10 +60,19 @@ constexpr auto common_vec_size()
     })(vec_size<Ts>()...);
 }
 
+// Bools can not be used as a vector type so convert it to uint8
+template <class T>
+__device__ __host__ T* remove_bool(T* x)
+{
+    return x;
+}
+
+inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
+
 template <index_int N, class T>
 __device__ __host__ auto as_vec(T* x)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return reinterpret_cast<vec<T, N>*>(x);
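Editor's note: two things happen in this hunk that are easy to misread in diff form — remove_bool moves into vec.hpp so that bool* data can be viewed as uint8_t* before being reinterpreted as a vector, and the as_vec guard becomes N < 2 so that a width of 1 is treated the same as "not vectorized". A host-only sketch of both follows; as_vec_sketch is an invented name that uses a plain array-pointer cast as a stand-in for vec<T, N>* and is never dereferenced.

    #include <cstdint>
    #include <type_traits>

    // Host-side copy of the remove_bool overloads added in vec.hpp: bool cannot be
    // used as a vector element type, so bool* is viewed as uint8_t* instead.
    template <class T>
    T* remove_bool(T* x)
    {
        return x;
    }

    inline std::uint8_t* remove_bool(bool* x) { return reinterpret_cast<std::uint8_t*>(x); }

    // Sketch of the as_vec change: the guard is now N < 2, so both the "not
    // vectorized" case (N == 0) and the trivial case (N == 1) return the scalar
    // pointer unchanged instead of producing a one-element vector type.
    template <unsigned N, class T>
    auto as_vec_sketch(T* x)
    {
        if constexpr(N < 2)
            return x; // N == 0 and N == 1 both stay scalar
        else
            return reinterpret_cast<T(*)[N]>(x); // stand-in for vec<T, N>*
    }

    int main()
    {
        bool flags[4] = {true, false, true, false};
        float data[4] = {1, 2, 3, 4};

        static_assert(std::is_same<decltype(remove_bool(flags)), std::uint8_t*>::value, "");
        static_assert(std::is_same<decltype(remove_bool(data)), float*>::value, "");

        static_assert(std::is_same<decltype(as_vec_sketch<0>(data)), float*>::value, "");
        static_assert(std::is_same<decltype(as_vec_sketch<1>(data)), float*>::value, "");
        static_assert(std::is_same<decltype(as_vec_sketch<4>(data)), float(*)[4]>::value, "");

        (void)flags;
        (void)data;
        return 0;
    }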
src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp    View file @ 417d6644

@@ -50,19 +50,10 @@ constexpr auto shape_step(Shape s, Axis)
     });
 }
 
-// Bools can not be used as a vector type so convert it to uint8
-template <class T>
-__device__ __host__ T* remove_bool(T* x)
-{
-    return x;
-}
-
-inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
-
 template <index_int N, class T, class Axis>
 __device__ __host__ auto as_vec(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return make_tensor_view(as_vec<N>(remove_bool(x.data())),
@@ -72,7 +63,7 @@ __device__ __host__ auto as_vec(T x, Axis axis)
 template <index_int N, class T, class Axis>
 constexpr auto tensor_step(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
     {
         return x;
     }
@@ -157,11 +148,11 @@ constexpr auto find_vectorize_size(P pred)
     else if constexpr(decltype(pred(_c<2>)){})
         return _c<2>;
     else
-        return _c<0>;
+        return _c<1>;
 }
 
 template <class T>
-__host__ __device__ auto vectorize(T x)
+__host__ __device__ auto auto_vectorize(T x)
 {
     if constexpr(tensor_vec_size<T>() == 0)
     {
@@ -194,7 +185,7 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
 {
     MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
     MIGRAPHX_ASSERT(s.lens[axis] > 0);
-    MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
+    MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
     if constexpr(s.strides[axis] == 0)
         return tensor_step<n>(x, axis);
     else
@@ -215,7 +206,32 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
 inline __device__ __host__ auto auto_vectorize()
 {
-    return [](auto... xs) {
-        return [=](auto f) { auto_vectorize_impl(f, xs...); };
-    };
+    return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
 }
 
+template <index_int N, index_int Axis, class T>
+__device__ __host__ auto vectorize_tensor(T x)
+{
+    constexpr auto shape = get_shape_c<T>{};
+    if constexpr(shape.strides[Axis] == 0)
+        return tensor_step<N>(x, _c<Axis>);
+    else
+        return as_vec<N>(x, _c<Axis>);
+}
+
+template <index_int N, index_int Axis>
+__device__ __host__ auto vectorize()
+{
+    return make_transform([](auto f, auto... xs) {
+        if constexpr(N < 2)
+        {
+            f(xs...);
+        }
+        else
+        {
+            f(vectorize_tensor<N, Axis>(xs)...);
+        }
+    });
+}
+
 } // namespace migraphx
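Editor's note: find_vectorize_size now falls back to _c<1> instead of _c<0>, which is why the assertion in auto_vectorize_impl changed from n == 0 to n == 1 — "no vectorization" is now expressed as a width of 1. A small constexpr restatement of that divisor search follows; pick_vector_width is an invented name, and the real function works over integral constants and a caller-supplied predicate rather than a plain size.

    #include <cstddef>

    // Host sketch of the divisor search done by find_vectorize_size: prefer a
    // width of 4, then 2; the fallback is now 1 ("stay scalar") instead of 0.
    constexpr std::size_t pick_vector_width(std::size_t len)
    {
        if(len % 4 == 0)
            return 4;
        if(len % 2 == 0)
            return 2;
        return 1;
    }

    static_assert(pick_vector_width(64) == 4, "multiple of 4 vectorizes by 4");
    static_assert(pick_vector_width(6) == 2, "multiple of 2 vectorizes by 2");
    static_assert(pick_vector_width(7) == 1, "odd extents fall back to scalar");

    int main() { return 0; }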
src/targets/gpu/prefuse_ops.cpp  (new file, 0 → 100644)    View file @ 417d6644

#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/make_op.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

namespace {
struct find_layernorm
{
    auto matcher() const { return match::layernorm(); }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins   = r.result;
        auto x_ins = r.instructions["x"];

        if(not x_ins->get_shape().standard())
            x_ins = m.insert_instruction(ins, make_op("contiguous"), x_ins);

        auto relements = x_ins->get_shape().lens().back();

        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
            return;

        auto a = m.insert_instruction(
            ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
        m.replace_instruction(ins, make_op("gpu::layernorm"), x_ins, a);
    }
};

struct find_triaddlayernorm
{
    auto matcher() const
    {
        auto add1 = match::name("add")(
            match::none_of(match::is_constant()),
            match::args(match::any().bind("z1"), match::any().bind("z2")));
        auto add2 = match::name("add")(match::either_arg(0, 1)(add1, match::any().bind("z3")));
        return match::layernorm()(match::var("x")(add2));
    }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins   = r.result;
        auto x_ins = r.instructions["z1"];
        auto y_ins = r.instructions["z2"];
        auto z_ins = r.instructions["z3"];

        for(auto* pins : {&x_ins, &y_ins, &z_ins})
        {
            if(not(*pins)->get_shape().standard())
                *pins = m.insert_instruction(ins, make_op("contiguous"), *pins);
        }

        auto relements = x_ins->get_shape().lens().back();

        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
            return;

        auto a = m.insert_instruction(
            ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
        m.replace_instruction(ins, make_op("gpu::triadd_layernorm"), x_ins, y_ins, z_ins, a);
    }
};
} // namespace

void prefuse_ops::apply(module& m) const
{
    match::find_matches(m, find_triaddlayernorm{}, find_layernorm{});
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
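Editor's note: both matchers gate the fusion on the reduced dimension before rewriting the graph. A host-side restatement of that guard follows, using the same condition as the code above; fusable_layernorm is an invented helper name used only for illustration.

    #include <cstddef>

    // The condition both find_layernorm and find_triaddlayernorm check before
    // fusing: nothing above 1024 elements is fused, and extents above 256 are
    // only fused when they are a multiple of 4.
    constexpr bool fusable_layernorm(std::size_t relements)
    {
        return not(relements > 1024 or (relements % 4 != 0 and relements > 256));
    }

    static_assert(fusable_layernorm(768), "multiple of 4 and <= 1024: fused");
    static_assert(not fusable_layernorm(2048), "larger than 1024: skipped");
    static_assert(not fusable_layernorm(513), "> 256 and not a multiple of 4: skipped");
    static_assert(fusable_layernorm(250), "at or below 256 need not be a multiple of 4");

    int main() { return 0; }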
src/targets/gpu/schedule_model.cpp    View file @ 417d6644

@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event)
 MIGRAPHX_REGISTER_OP(set_stream)
 
 std::size_t schedule_model::concurrency() const { return streams; }
-void schedule_model::sched(module& p, instruction_ref ins, std::size_t n) const
+void schedule_model::sched(module& m, instruction_ref ins, std::size_t n) const
 {
     auto last_stream = std::find_if(std::make_reverse_iterator(ins),
-                                    std::make_reverse_iterator(p.begin()),
+                                    std::make_reverse_iterator(m.begin()),
                                     [&](auto&& i) { return i.name() == "gpu::set_stream"; });
-    if(last_stream != std::make_reverse_iterator(p.begin()))
+    if(last_stream != std::make_reverse_iterator(m.begin()))
     {
         auto&& op = any_cast<set_stream>(last_stream->get_operator());
         // If the same stream was set earlier then skip
         if(op.stream == n)
             return;
     }
-    p.insert_instruction(ins, set_stream{n});
+    m.insert_instruction(ins, set_stream{n});
 }
-void schedule_model::wait(module& p, instruction_ref ins, std::size_t wait_id) const
+void schedule_model::wait(module& m, instruction_ref ins, std::size_t wait_id) const
 {
-    p.insert_instruction(ins, wait_event{wait_id});
+    m.insert_instruction(ins, wait_event{wait_id});
 }
-void schedule_model::record(module& p, instruction_ref ins, std::size_t wait_id) const
+void schedule_model::record(module& m, instruction_ref ins, std::size_t wait_id) const
 {
-    p.insert_instruction(std::next(ins), record_event{wait_id});
+    m.insert_instruction(std::next(ins), record_event{wait_id});
 }
 
 static std::unordered_map<std::string, std::size_t> create_weight_map()
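Editor's note: this hunk is only a parameter rename (p to m), but the search it touches is a reusable idiom — walk backwards from the insertion point with reverse iterators to find the most recent gpu::set_stream and skip inserting a duplicate. A standalone sketch of that reverse find_if over a std::list standing in for the instruction stream (all names here are invented):

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <list>
    #include <string>

    int main()
    {
        // Stand-in for a module's instruction stream.
        std::list<std::string> instructions = {
            "gpu::set_stream", "gpu::convolution", "gpu::set_stream", "gpu::add", "gpu::relu"};

        // Insertion point: before "gpu::relu".
        auto ins = std::prev(instructions.end());

        // Same shape as schedule_model::sched: search backwards from ins to begin()
        // for the most recent stream assignment.
        auto last_stream = std::find_if(std::make_reverse_iterator(ins),
                                        std::make_reverse_iterator(instructions.begin()),
                                        [&](auto&& i) { return i == "gpu::set_stream"; });

        assert(last_stream != std::make_reverse_iterator(instructions.begin()));
        assert(*last_stream == "gpu::set_stream");
        // base() points one past the found element in forward order, so
        // std::prev(last_stream.base()) is the found element (index 2 here).
        assert(std::distance(instructions.begin(), std::prev(last_stream.base())) == 2);
        return 0;
    }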
src/targets/gpu/sync_device.cpp    View file @ 417d6644

@@ -8,9 +8,9 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-void sync_device::apply(module& p) const
+void sync_device::apply(module& m) const
 {
-    auto last = std::prev(p.end());
+    auto last = std::prev(m.end());
     if(last->name() == "@return")
     {
         auto inputs = last->inputs();
@@ -18,10 +18,10 @@ void sync_device::apply(module& p) const
                return (i->name() == "hip::copy_from_gpu");
            }))
         {
-            auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
+            auto sync_in = m.insert_instruction(last, make_op("hip::sync_stream"), inputs);
             if(not inputs.empty())
             {
-                p.replace_instruction(inputs.front(), sync_in);
+                m.replace_instruction(inputs.front(), sync_in);
             }
         }
     }
src/targets/gpu/target.cpp    View file @ 417d6644

@@ -31,6 +31,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/eliminate_workspace.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
+#include <migraphx/gpu/prefuse_ops.hpp>
 #include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/mlir_conv.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
@@ -96,6 +97,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         simplify_algebra{},
         simplify_reshapes{},
         simplify_algebra{},
+        prefuse_ops{},
+        dead_code_elimination{},
         auto_contiguous{},
         simplify_reshapes{},
         propagate_constant{},
src/targets/gpu/write_literals.cpp    View file @ 417d6644

@@ -11,25 +11,25 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_COPY_LITERALS)
 
-void write_literals::apply(module& p) const
+void write_literals::apply(module& m) const
 {
     assert(ctx != nullptr);
     std::size_t n = 0;
-    for(auto ins : iterator_for(p))
+    for(auto ins : iterator_for(m))
     {
         if(ins->name() == "@literal")
         {
             if(enabled(MIGRAPHX_COPY_LITERALS{}))
             {
                 literal l  = ins->get_literal();
-                auto pre   = p.add_literal(l);
-                auto alloc = p.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
-                p.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
+                auto pre   = m.add_literal(l);
+                auto alloc = m.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
+                m.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
             }
             else
             {
-                std::string id = p.name() + ":@literal:" + std::to_string(n);
-                p.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
+                std::string id = m.name() + ":@literal:" + std::to_string(n);
+                m.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
                 n++;
             }
         }
test/api/test_module_construct.cpp    View file @ 417d6644

@@ -3,23 +3,21 @@
 #include <migraphx/migraphx.hpp>
 #include "test.hpp"
 
-TEST_CASE(add_op)
+TEST_CASE(add_literals)
 {
     migraphx::program p;
     migraphx::module m = p.get_main_module();
     migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
-    auto x      = m.add_parameter("x", param_shape);
-    auto y      = m.add_parameter("y", param_shape);
+    std::vector<float> x_values(9, 1);
+    auto x = m.add_literal(param_shape, x_values.data());
+    std::vector<float> y_values(9, -1);
+    auto y      = m.add_literal(param_shape, y_values.data());
     auto add_op = migraphx::operation("add");
     auto r      = m.add_instruction(add_op, {x, y});
     m.add_return({r});
     // run on ref target
     p.compile(migraphx::target("ref"));
     migraphx::program_parameters pp;
-    std::vector<float> x_data(9, 1);
-    std::vector<float> y_data(9, -1);
-    pp.add("x", migraphx::argument(param_shape, x_data.data()));
-    pp.add("y", migraphx::argument(param_shape, y_data.data()));
     auto outputs = p.eval(pp);
     auto output  = outputs[0];
     std::vector<float> expected(9, 0);
test/dead_code_elimination_test.cpp    View file @ 417d6644

@@ -180,6 +180,40 @@ TEST_CASE(duplicate_args3)
     EXPECT(result == migraphx::literal{0});
 }
 
+TEST_CASE(reused_twice)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    std::vector<size_t> dims = {1, 2, 2};
+    auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
+    auto y = mm->add_parameter("y", migraphx::shape{migraphx::shape::float_type, dims});
+    auto z = mm->add_parameter("z", migraphx::shape{migraphx::shape::float_type, dims});
+    auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
+    auto add2 = mm->add_instruction(migraphx::make_op("add"), add1, z);
+    auto epsilon  = mm->add_literal(1e-12f);
+    auto exponent = mm->add_literal(2.0f);
+    auto mean = mm->add_instruction(migraphx::make_op("reduce_mean", {{"axes", {2}}}), add2);
+    auto mean_mbcast =
+        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", dims}}), mean);
+    auto sub = mm->add_instruction(migraphx::make_op("sub"), add2, mean_mbcast);
+    auto exponent_mbcast =
+        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", dims}}), exponent);
+    auto pow = mm->add_instruction(migraphx::make_op("pow"), sub, exponent_mbcast);
+    auto var = mm->add_instruction(migraphx::make_op("reduce_mean", {{"axes", {2}}}), pow);
+    auto epsilon_mbcast = mm->add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", {1, dims.at(1), 1}}}), epsilon);
+    auto add_epsilon = mm->add_instruction(migraphx::make_op("add"), var, epsilon_mbcast);
+    mm->add_instruction(migraphx::make_op("sqrt"), add_epsilon);
+    mm->add_instruction(migraphx::make_op("add"), x, y);
+    auto count = std::distance(mm->begin(), mm->end());
+    run_pass(p);
+    p.debug_print();
+    EXPECT(std::distance(mm->begin(), mm->end()) != count);
+    EXPECT(std::distance(mm->begin(), mm->end()) == 4);
+}
+
 TEST_CASE(unused_module)
 {
     migraphx::program p;
test/matcher.cpp    View file @ 417d6644

@@ -332,7 +332,7 @@ TEST_CASE(match_either_args_any1)
         match::name("sum")(match::either_arg(0, 1)(match::any().bind("x"), match::any().bind("y")));
     auto r = find_match(mm, m);
     EXPECT(bool{r.result == sum1});
-    EXPECT(bool{r.instructions.at("x") != r.instructions.at("y")});
+    EXPECT(bool{r.instructions["x"] != r.instructions["y"]});
 }
 
 TEST_CASE(match_either_args_any2)
@@ -347,7 +347,7 @@ TEST_CASE(match_either_args_any2)
         match::either_arg(0, 1)(match::any().bind("x"), match::name("@literal").bind("y")));
     auto r = find_match(mm, m);
     EXPECT(bool{r.result == sum1});
-    EXPECT(bool{r.instructions.at("x") != r.instructions.at("y")});
+    EXPECT(bool{r.instructions["x"] != r.instructions["y"]});
 }
 
 TEST_CASE(match_either_args_any3)
@@ -362,7 +362,7 @@ TEST_CASE(match_either_args_any3)
         match::either_arg(0, 1)(match::name("@literal").bind("x"), match::any().bind("y")));
     auto r = find_match(mm, m);
     EXPECT(bool{r.result == sum1});
-    EXPECT(bool{r.instructions.at("x") != r.instructions.at("y")});
+    EXPECT(bool{r.instructions["x"] != r.instructions["y"]});
 }
 
 TEST_CASE(match_either_args_any4)
@@ -377,7 +377,7 @@ TEST_CASE(match_either_args_any4)
         match::either_arg(0, 1)(match::name("sum").bind("x"), match::any().bind("y")));
     auto r = find_match(mm, m);
     EXPECT(bool{r.result == sum2});
-    EXPECT(bool{r.instructions.at("x") != r.instructions.at("y")});
+    EXPECT(bool{r.instructions["x"] != r.instructions["y"]});
 }
 
 TEST_CASE(match_either_args_any5)
@@ -392,7 +392,7 @@ TEST_CASE(match_either_args_any5)
         match::either_arg(0, 1)(match::any().bind("x"), match::name("sum").bind("y")));
     auto r = find_match(mm, m);
     EXPECT(bool{r.result == sum2});
-    EXPECT(bool{r.instructions.at("x") != r.instructions.at("y")});
+    EXPECT(bool{r.instructions["x"] != r.instructions["y"]});
 }
 
 TEST_CASE(match_all_of1)
@@ -747,10 +747,10 @@ TEST_CASE(match_bind1)
                            match::standard_shape())
                   .bind("pass");
     auto r = find_match(mm, m);
-    EXPECT(bool{r.instructions.at("one") == one});
-    EXPECT(bool{r.instructions.at("two") == two});
-    EXPECT(bool{r.instructions.at("sum") == sum});
-    EXPECT(bool{r.instructions.at("pass") == pass});
+    EXPECT(bool{r.instructions["one"] == one});
+    EXPECT(bool{r.instructions["two"] == two});
+    EXPECT(bool{r.instructions["sum"] == sum});
+    EXPECT(bool{r.instructions["pass"] == pass});
     EXPECT(bool{r.result == pass});
 }
 
@@ -795,9 +795,9 @@ TEST_CASE(match_bind_modules2)
                            match::standard_shape())
                   .bind("pass");
     auto r = find_match(*child, m);
-    EXPECT(bool{r.instructions.at("two") == two});
-    EXPECT(bool{r.instructions.at("sum") == sum});
-    EXPECT(bool{r.instructions.at("pass") == pass});
+    EXPECT(bool{r.instructions["two"] == two});
+    EXPECT(bool{r.instructions["sum"] == sum});
+    EXPECT(bool{r.instructions["pass"] == pass});
     EXPECT(bool{r.result == pass});
 }
test/py/test_module_construct.py    View file @ 417d6644

-import migraphx
+import migraphx, array, sys
+
+
+def create_buffer(t, data, shape):
+    a = array.array(t, data)
+    m = memoryview(a.tobytes())
+    return m.cast(t, shape)
 
 
 def test_add_op():
     p = migraphx.program()
     mm = p.get_main_module()
-    param_shape = migraphx.shape(lens=[3, 3], type="float")
-    x = mm.add_parameter("x", param_shape)
-    y = mm.add_parameter("y", param_shape)
+    x = mm.add_literal(create_buffer('f', [1.0] * 9, (3, 3)))
+    y = mm.add_literal(create_buffer('f', [2.0] * 9, (3, 3)))
     add_op = mm.add_instruction(migraphx.op("add"), [x, y])
     mm.add_return([add_op])
     p.compile(migraphx.get_target("ref"))
     params = {}
-    params["x"] = migraphx.generate_argument(param_shape)
-    params["y"] = migraphx.generate_argument(param_shape)
     output = p.run(params)[-1].tolist()
-    assert output == [
-        a + b for a, b in zip(params["x"].tolist(), params["y"].tolist())
-    ]
+    assert output == list([3.0] * 9)
 
 
 def test_if_then_else():

@@ -60,5 +61,6 @@ def test_if_then_else():
 
 if __name__ == "__main__":
-    test_add_op()
+    if sys.version_info >= (3, 0):
+        test_add_op()
     test_if_then_else()
test/py/test_numpy.py  (new file, 0 → 100644)    View file @ 417d6644

import migraphx, sys
try:
    import numpy as np
except:
    sys.exit()


def test_add_op():
    p = migraphx.program()
    mm = p.get_main_module()
    x = mm.add_literal(np.ones((3, 3), dtype='float32'))
    y = mm.add_literal(2 * np.ones((3, 3), dtype='float32'))
    add_op = mm.add_instruction(migraphx.op("add"), [x, y])
    mm.add_return([add_op])
    p.compile(migraphx.get_target("ref"))
    params = {}
    output = p.run(params)[-1].tolist()
    assert output == list(3 * np.ones((9), dtype='float32'))


if __name__ == "__main__":
    test_add_op()
tools/install_prereqs.sh    View file @ 417d6644

@@ -4,12 +4,20 @@
 set -e
 
-#install pip3, rocm-cmake, rocblas and miopen
-apt update && apt install -y python3-pip rocm-cmake rocblas miopen-hip openmp-extras
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+# Need pip3 and Python headers to build dependencies
+apt update && apt install -y python3-pip python3-dev cmake rocm-cmake rocblas miopen-hip openmp-extras
+
+# Needed for cmake to build various pip packages
+pip3 install setuptools wheel
+
 # install rbuild to build dependencies
 pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
 
 PREFIX=/usr/local
 REQ_FILE_DIR=""
 
 if [ "$#" -ge 2 ]; then