Commit d5bdfed0 authored by turneram

Merge remote-tracking branch 'origin/develop' into transformer-opts

parents 5ded4ac1 4ec8209f
@@ -319,7 +319,7 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
     argument
@@ -357,7 +357,7 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(3);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -223,7 +223,7 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
    {
         check_shapes{inputs, *this}.has(1);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -93,7 +93,7 @@ add_library(migraphx_device
 )
 add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
-target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
+target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
 check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
 if(HAS_HIP_LAMBDA_HOST_DEVICE)
     message(STATUS "Enable -fhip-lambda-host-device")
@@ -22,6 +22,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
 #if MIGRAPHX_USE_HIPRTC
@@ -133,6 +134,7 @@ struct hiprtc_program
         std::vector<char> buffer(n);
         MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
         assert(buffer.back() == 0);
+        // cppcheck-suppress returnDanglingLifetime
         return {buffer.begin(), buffer.end() - 1};
     }
@@ -246,6 +248,16 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
         MIGRAPHX_THROW("Missing hsaco");
     };
+    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
+    {
+        for(const auto& src : srcs)
+        {
+            if(src.path.extension() != ".cpp")
+                continue;
+            std::cout << std::string(src.content.first, src.len()) << std::endl;
+        }
+    }
     if(enabled(MIGRAPHX_GPU_DUMP_ASM{}))
     {
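
A usage note on the new switch: with the block above in place, setting the MIGRAPHX_GPU_DUMP_SRC environment variable makes compile_hip_src print every .cpp kernel source it is handed before compilation, complementing MIGRAPHX_GPU_DUMP_ASM, which dumps the compiled assembly.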
@@ -6,6 +6,7 @@
 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
+#include <migraphx/permutation.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
@@ -28,7 +29,8 @@ ${preamble}
 extern "C" {
 __global__ void kernel(${params})
 {
-    pointwise(${lambda}, ${args});
+    auto idx = make_index();
+    pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args});
 }
 }
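
For orientation, here is a sketch of what the interpolated template could produce for a two-input add with vec_size 4 on axis 1 where only the second input is preloaded into LDS. The private_p0/private_p1/private_p2 naming and the inline lambda are illustrative assumptions; the real values come from enum_params and the op's stored lambda:

extern "C" {
__global__ void kernel(void * private_p0, void * private_p1, void * private_p2)
{
    auto idx = make_index();
    pointwise(idx, auto_preload<false, false, true>(idx), vectorize<4, 1>())(
        [](auto x, auto y) { return x + y; }, private_p0, private_p1, private_p2);
}
}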
@@ -41,40 +43,105 @@ struct pointwise_compiler : compiler<pointwise_compiler>
 {
     std::vector<std::string> names() const { return {"pointwise"}; }
-    static std::size_t oversubscribe(const std::vector<shape>& inputs)
-    {
-        if(std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); }))
-            return 1;
-        else
-            return 256;
-    }
-    static std::size_t vectorize_elements(const std::vector<shape>& inputs)
-    {
-        std::size_t n = inputs.front().elements();
-        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
-               return s.packed() or s.broadcasted();
-           }))
-        {
-            if((n % 4) == 0)
-                return n / 4;
-            else if((n % 2) == 0)
-                return n / 2;
-        }
-        return n;
-    }
+    static std::size_t oversubscribe_if(bool b)
+    {
+        if(b)
+            return 256;
+        else
+            return 1;
+    }
+    static std::size_t find_fast_axis(const std::vector<shape>& inputs)
+    {
+        auto permutation = find_permutation(inputs);
+        auto it = std::max_element(permutation.begin(), permutation.end());
+        return it - permutation.begin();
+    }
+    static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
+    {
+        const std::size_t max_lds_bytes = 4096;
+        std::vector<bool> result;
+        std::transform(inputs.begin(),
+                       inputs.end(),
+                       std::back_inserter(result),
+                       [&](const shape& input) { return input.strides()[axis] == 0; });
+        auto bytes = std::inner_product(inputs.begin(),
+                                        inputs.end(),
+                                        result.begin(),
+                                        std::size_t{0},
+                                        std::plus<>{},
+                                        [](const shape& s, bool b) -> std::size_t {
+                                            if(b)
+                                                return s.bytes();
+                                            return 0;
+                                        });
+        if(bytes < max_lds_bytes)
+            return result;
+        // TODO: Try to partially preload items
+        std::fill(result.begin(), result.end(), false);
+        return result;
+    }
+    static std::string preload_str(const std::vector<bool>& bs)
+    {
+        std::vector<std::string> bool_strs;
+        std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
+            if(b)
+                return "true";
+            return "false";
+        });
+        return "false, " + join_strings(bool_strs, ", ");
+    }
+    static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
+    {
+        // If all inputs are half then only use half2
+        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
+               return s.type() == shape::half_type;
+           }))
+            return {2};
+        return {4, 2};
+    }
+    static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
+    {
+        auto sizes = vector_sizes(inputs);
+        std::vector<std::size_t> max_vec_size;
+        std::transform(inputs.begin(),
+                       inputs.end(),
+                       std::back_inserter(max_vec_size),
+                       [&](const auto& input) -> std::size_t {
+                           auto stride = input.strides()[axis];
+                           auto len = input.lens()[axis];
+                           if(stride != 0 and stride != 1)
+                               return 1;
+                           auto it = std::find_if(
+                               sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
+                           if(it != sizes.end())
+                               return *it;
+                           return 1;
+                       });
+        return *std::min_element(max_vec_size.begin(), max_vec_size.end());
+    }
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        options.set_launch_params(
-            v, compute_global_for(ctx, vectorize_elements(inputs), oversubscribe(inputs)));
         options.inputs = inputs;
         options.output = inputs.back();
         options.virtual_inputs = reduce_dims(inputs);
         options.params = "-Wno-float-equal";
-        auto src = interpolate_string(pointwise_kernel,
+        auto axis = find_fast_axis(options.virtual_inputs);
+        auto vec_size = vectorize_elements(axis, options.virtual_inputs);
+        auto preloads = preload(axis, options.virtual_inputs);
+        auto is_preloading =
+            std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
+        options.set_launch_params(v,
+                                  compute_global_for(ctx,
+                                                     options.output.elements() / vec_size,
+                                                     oversubscribe_if(not is_preloading)));
+        auto src = interpolate_string(pointwise_kernel,
                                       {{"params", enum_params(inputs.size(), "void * private_p")},
                                        {"args", enum_params(inputs.size(), "private_p")},
                                        {"lambda", v.at("lambda").to<std::string>()},
+                                       {"vec_size", std::to_string(vec_size)},
+                                       {"axis", std::to_string(axis)},
+                                       {"preloads", preload_str(preloads)},
                                        {"preamble", v.get("preamble", std::string{})}});
         return compile_hip_code_object(src, options);
     }
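
To make the selection logic concrete, here is a small host-only sketch of the vectorization decision; toy_shape and pick_vec_size are illustrative stand-ins for migraphx::shape and vectorize_elements, not library APIs:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct toy_shape
{
    std::vector<std::size_t> lens;
    std::vector<std::size_t> strides;
};

// Every input must either ignore the fast axis (stride 0, a broadcast) or be
// contiguous on it (stride 1), and the axis length must be divisible by the
// candidate width; otherwise the whole kernel falls back to a smaller width.
std::size_t pick_vec_size(std::size_t axis, const std::vector<toy_shape>& inputs)
{
    std::size_t result = 4;
    for(const auto& s : inputs)
    {
        std::size_t best = 1;
        std::size_t stride = s.strides[axis];
        if(stride == 0 or stride == 1)
        {
            for(std::size_t n : {4, 2})
            {
                if(s.lens[axis] % n == 0)
                {
                    best = n;
                    break;
                }
            }
        }
        result = std::min(result, best);
    }
    return result;
}

int main()
{
    toy_shape a{{8, 6}, {6, 1}}; // packed: fast-axis length 6, so only 2 divides it
    toy_shape b{{8, 6}, {1, 0}}; // broadcast on the fast axis: stride 0 still qualifies
    assert(pick_vec_size(1, {a, b}) == 2);
}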
@@ -52,9 +52,8 @@ struct scatternd_compiler : compiler<scatternd_compiler>
     {
         hip_compile_options options;
         options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
-        auto out_s = inputs.back();
         options.inputs = inputs;
-        options.output = out_s;
+        options.output = inputs.back();
         options.kernel_name = "scatternd_kernel";
         options.virtual_inputs = inputs;
         auto reduction = "assign_" + v.get("reduction", std::string{"none"});
@@ -3,6 +3,14 @@
 #include <migraphx/kernels/integral_constant.hpp>
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_RETURNS(...) \
+    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_LIFT(...) \
+    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+
 namespace migraphx {

 struct swallow

@@ -161,6 +169,18 @@ constexpr auto pack(Ts... xs)
     return [=](auto f) { return f(xs...); };
 }
+template <class G, class F>
+constexpr auto join(G g, F f)
+{
+    return f([=](auto... xs) { return g(xs...); });
+}
+
+template <class G, class F, class... Fs>
+constexpr auto join(G g, F f, Fs... fs)
+{
+    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
+}
+
 template <class Compare, class P1, class P2>
 constexpr auto pack_compare(Compare compare, P1 p1, P2 p2)
 {
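
Since join is easiest to understand by example, here is a minimal host-side sketch (pack and join re-declared verbatim so the snippet compiles standalone) showing how it flattens several packs into one call:

#include <cassert>

template <class... Ts>
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class G, class F>
constexpr auto join(G g, F f)
{
    return f([=](auto... xs) { return g(xs...); });
}

template <class G, class F, class... Fs>
constexpr auto join(G g, F f, Fs... fs)
{
    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
}

int main()
{
    // join(g, pack(1), pack(2, 3)) unpacks both packs and calls g(1, 2, 3)
    auto r = join([](int a, int b, int c) { return a + b + c; }, pack(1), pack(2, 3));
    assert(r == 6);
}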
@@ -191,39 +211,45 @@ constexpr auto arg(IntegralConstant ic)
     return arg_c<ic>();
 }
-inline constexpr auto rotate_last()
-{
-    return [](auto... xs) {
-        return [=](auto&& f) {
-            return sequence_c<sizeof...(xs)>([&](auto... is) {
-                constexpr auto size = sizeof...(is);
-                return f(arg_c<(is + size - 1) % size>()(xs...)...);
-            });
-        };
-    };
-}
-template <class F>
-constexpr auto transform_args(F f)
-{
-    return [=](auto... xs) {
-        return [=](auto g) { return f(xs...)([&](auto... ys) { return g(ys...); }); };
-    };
-}
-template <class F, class... Fs>
-constexpr auto transform_args(F f, Fs... fs)
-{
-    return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
-}
-// NOLINTNEXTLINE
-#define MIGRAPHX_RETURNS(...) \
-    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
-// NOLINTNEXTLINE
-#define MIGRAPHX_LIFT(...) \
-    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+template <class F>
+constexpr auto make_transform(F f)
+{
+    return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
+}
+// An arg transformation takes the arguments and then a function to take the new arguments:
+// transform(xs...)([](auto... ys) { ... })
+// The transform_args function takes a list of transformations and continually applies them
+template <class F>
+constexpr auto transform_args(F f)
+{
+    return f;
+}
+template <class F, class... Fs>
+constexpr auto transform_args(F f, Fs... fs)
+{
+    return make_transform([=](auto g, auto... xs) {
+        return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
+    });
+}
+// identity transform
+inline constexpr auto transform_args()
+{
+    return make_transform([](auto f, auto... xs) { return f(xs...); });
+}
+// Rotate the first argument to the last argument
+inline constexpr auto rotate_last()
+{
+    return make_transform([](auto f, auto... xs) {
+        return sequence_c<sizeof...(xs)>([&](auto... is) {
+            constexpr auto size = sizeof...(is);
+            return f(arg_c<(is + size - 1) % size>()(xs...)...);
+        });
+    });
+}
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
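
The comment block above states the protocol; a standalone host-side sketch (same definitions, minus the device qualifiers) shows the composition order, with each transform rewriting the argument list before the next one runs:

#include <cassert>

template <class F>
auto make_transform(F f)
{
    return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
}

template <class F>
auto transform_args(F f)
{
    return f;
}

template <class F, class... Fs>
auto transform_args(F f, Fs... fs)
{
    return make_transform([=](auto g, auto... xs) {
        return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
    });
}

int main()
{
    auto plus_one = make_transform([](auto f, auto... xs) { return f((xs + 1)...); });
    auto swap_two = make_transform([](auto f, auto x, auto y) { return f(y, x); });
    // (1, 2) -> plus_one -> (2, 3) -> swap_two -> (3, 2) -> final function
    auto r = transform_args(plus_one, swap_two)(1, 2)([](int a, int b) { return a * 10 + b; });
    assert(r == 32);
}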
@@ -38,20 +38,17 @@ constexpr implicit_conversion_op<T> implicit_conversion(T x)
 template <class F, class T, class... Ts>
 __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
 {
-    preload<typename T::type>(idx, xs...)([&](auto... ps) {
-        idx.global_stride(out.get_shape().elements(),
-                          [&](auto i) { out[i] = implicit_conversion(f(ps[i]...)); });
-    });
+    idx.global_stride(out.get_shape().elements(),
+                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
 }
-template <class F, class... Ts>
-__device__ void pointwise(F f, Ts*... ps)
+template <class... Transforms>
+__device__ auto pointwise(index idx, Transforms... transforms)
 {
-    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
-    t(ps...)([&](auto... xs) {
-        auto idx = make_index();
-        pointwise_tensor(idx, f, xs...);
-    });
+    return [=](auto f, auto*... ps) {
+        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
+        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
+    };
 }
 } // namespace migraphx
@@ -3,6 +3,8 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/functional.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/vec.hpp>
 namespace migraphx {

@@ -73,7 +75,7 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
     {
         if constexpr(decltype(tensor_vec_size(x)){} == 0)
         {
-            auto v = vectorize(x);
+            auto v = auto_vectorize(x);
             auto b = as_vec(tensor_vec_size(v), buffer + offset);
             idx.local_stride(v.get_shape().element_space(),
                              [&](auto i) { b[i] = v.data()[i]; });

@@ -126,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
     };
 }
+inline __device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto out, auto... xs) {
+        preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
+    });
+}
+
+template <bool B, class T>
+__device__ auto preload_copy(index idx, T x)
+{
+    return [=](auto f) {
+        if constexpr(B)
+        {
+            using type = typename T::type;
+            constexpr auto size = get_shape_c<T>{}.element_space();
+            __shared__ type buffer[size];
+            // TODO: Always vectorize when size > 4, and then use a second loop for the remainder
+            constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
+            auto input = as_vec<n>(remove_bool(x.data()));
+            auto b = as_vec<n>(remove_bool(buffer));
+            idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
+            return f(x.with(buffer));
+        }
+        else
+        {
+            return f(x);
+        }
+    };
+}
+
+template <bool... Bs>
+__device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto... xs) {
+        auto invoke = [=](auto... ys) {
+            __syncthreads();
+            f(ys...);
+        };
+        join(invoke, preload_copy<Bs>(idx, xs)...);
+    });
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
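
The variadic auto_preload above leans on join from functional.hpp: each preload_copy<B> is a continuation that may swap its tensor for a shared-memory copy before the final invoke runs. A host-side analogue of that control flow (copy_stage is an illustrative stand-in, with multiplication standing in for the staging copy):

#include <cassert>

template <class G, class F>
auto join(G g, F f)
{
    return f([=](auto... xs) { return g(xs...); });
}

template <class G, class F, class... Fs>
auto join(G g, F f, Fs... fs)
{
    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
}

// Mimics preload_copy<B>: conditionally replace the argument before handing it
// to the continuation.
template <bool B>
auto copy_stage(int x)
{
    return [=](auto f) {
        if constexpr(B)
            return f(x * 100); // pretend x was staged into a shared buffer
        else
            return f(x);
    };
}

int main()
{
    int sum = 0;
    auto invoke = [&](auto... ys) { sum = (ys + ...); };
    join(invoke, copy_stage<true>(1), copy_stage<false>(2));
    assert(sum == 102); // stage 1 replaced its value, stage 2 passed it through
}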
@@ -118,15 +118,13 @@ constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
 }
 template <class T, class U, class V, class W, class Settings>
-__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t, Settings s)
+__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s)
 {
     auto index = make_index();
     const auto x = x_t.begin();
     const auto rois = rois_t.begin();
     const auto ind = ind_t.begin();
-    auto out_ptr = y_t.begin();
     // input shape
     auto x_lens = x_t.get_shape().lens;
     auto channel_num = x_lens[1];

@@ -176,25 +174,25 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
             const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
             if constexpr(s.is_avg_pooling)
             {
-                out_ptr[i] = calc_pooling(offset_x,
-                                          roi_starts,
-                                          bin_size,
-                                          {ph, pw},
-                                          bin_grid_size,
-                                          in_dims,
-                                          s.roi_offset,
-                                          avg_pool{});
+                y_t[i] = calc_pooling(offset_x,
+                                      roi_starts,
+                                      bin_size,
+                                      {ph, pw},
+                                      bin_grid_size,
+                                      in_dims,
+                                      s.roi_offset,
+                                      avg_pool{});
             }
             else
             {
-                out_ptr[i] = calc_pooling(offset_x,
-                                          roi_starts,
-                                          bin_size,
-                                          {ph, pw},
-                                          bin_grid_size,
-                                          in_dims,
-                                          s.roi_offset,
-                                          max_pool{});
+                y_t[i] = calc_pooling(offset_x,
+                                      roi_starts,
+                                      bin_size,
+                                      {ph, pw},
+                                      bin_grid_size,
+                                      in_dims,
+                                      s.roi_offset,
+                                      max_pool{});
             }
         }
 }
@@ -61,10 +61,19 @@ constexpr auto common_vec_size()
     })(vec_size<Ts>()...);
 }
+// Bools can not be used as a vector type so convert it to uint8
+template <class T>
+__device__ __host__ T* remove_bool(T* x)
+{
+    return x;
+}
+
+inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
+
 template <index_int N, class T>
 __device__ __host__ auto as_vec(T* x)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return reinterpret_cast<vec<T, N>*>(x);
@@ -50,19 +50,10 @@ constexpr auto shape_step(Shape s, Axis)
     });
 }
-// Bools can not be used as a vector type so convert it to uint8
-template <class T>
-__device__ __host__ T* remove_bool(T* x)
-{
-    return x;
-}
-
-inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
-
 template <index_int N, class T, class Axis>
 __device__ __host__ auto as_vec(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return make_tensor_view(as_vec<N>(remove_bool(x.data())),

@@ -72,7 +63,7 @@ __device__ __host__ auto as_vec(T x, Axis axis)
 template <index_int N, class T, class Axis>
 constexpr auto tensor_step(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
     {
         return x;
     }

@@ -157,11 +148,11 @@ constexpr auto find_vectorize_size(P pred)
     else if constexpr(decltype(pred(_c<2>)){})
         return _c<2>;
     else
-        return _c<0>;
+        return _c<1>;
 }
 template <class T>
-__host__ __device__ auto vectorize(T x)
+__host__ __device__ auto auto_vectorize(T x)
 {
     if constexpr(tensor_vec_size<T>() == 0)
     {

@@ -194,7 +185,7 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
         {
             MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
             MIGRAPHX_ASSERT(s.lens[axis] > 0);
-            MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
+            MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
             if constexpr(s.strides[axis] == 0)
                 return tensor_step<n>(x, axis);
             else

@@ -215,7 +206,32 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
 inline __device__ __host__ auto auto_vectorize()
 {
-    return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
+    return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
+}
+
+template <index_int N, index_int Axis, class T>
+__device__ __host__ auto vectorize_tensor(T x)
+{
+    constexpr auto shape = get_shape_c<T>{};
+    if constexpr(shape.strides[Axis] == 0)
+        return tensor_step<N>(x, _c<Axis>);
+    else
+        return as_vec<N>(x, _c<Axis>);
+}
+
+template <index_int N, index_int Axis>
+__device__ __host__ auto vectorize()
+{
+    return make_transform([](auto f, auto... xs) {
+        if constexpr(N < 2)
+        {
+            f(xs...);
+        }
+        else
+        {
+            f(vectorize_tensor<N, Axis>(xs)...);
+        }
+    });
 }
 } // namespace migraphx
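
What as_vec<N> ultimately does to memory is a pointer reinterpretation, so indexing moves in strides of N scalars. A host-side sketch of the consequence (vec_t is an illustrative stand-in for the GPU vec alias):

#include <cassert>

template <class T, int N>
struct vec_t
{
    T data[N];
};

int main()
{
    alignas(16) float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    // The vector view at index i covers scalar elements [N*i, N*i + N)
    auto* v = reinterpret_cast<vec_t<float, 4>*>(buf);
    assert(v[1].data[2] == 6);
}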
@@ -505,7 +505,7 @@ struct ref_unary : auto_register_op<ref_unary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(1);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -3,23 +3,21 @@
 #include <migraphx/migraphx.hpp>
 #include "test.hpp"
-TEST_CASE(add_op)
+TEST_CASE(add_literals)
 {
     migraphx::program p;
     migraphx::module m = p.get_main_module();
     migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
-    auto x = m.add_parameter("x", param_shape);
-    auto y = m.add_parameter("y", param_shape);
+    std::vector<float> x_values(9, 1);
+    auto x = m.add_literal(param_shape, x_values.data());
+    std::vector<float> y_values(9, -1);
+    auto y = m.add_literal(param_shape, y_values.data());
     auto add_op = migraphx::operation("add");
     auto r = m.add_instruction(add_op, {x, y});
     m.add_return({r});
     // run on ref target
     p.compile(migraphx::target("ref"));
     migraphx::program_parameters pp;
-    std::vector<float> x_data(9, 1);
-    std::vector<float> y_data(9, -1);
-    pp.add("x", migraphx::argument(param_shape, x_data.data()));
-    pp.add("y", migraphx::argument(param_shape, y_data.data()));
     auto outputs = p.eval(pp);
     auto output = outputs[0];
     std::vector<float> expected(9, 0);
@@ -3,6 +3,7 @@
 #include <migraphx/make_op.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/program.hpp>
+#include <migraphx/par_for.hpp>
 #include <migraphx/gpu/kernel.hpp>
 #include <migraphx/gpu/target.hpp>
 #include <migraphx/gpu/hip.hpp>

@@ -109,6 +110,24 @@ int main() {}
 )__migraphx__";
+
+// NOLINTNEXTLINE
+const std::string math_template = R"__migraphx__(
+#include <migraphx/kernels/pointwise.hpp>
+#include <migraphx/kernels/math.hpp>
+extern "C" {
+__global__ void kernel(${type}* p)
+{
+    auto x = *p;
+    *p = migraphx::implicit_conversion(migraphx::${invoke});
+}
+}
+int main() {}
+)__migraphx__";
+
 migraphx::src_file make_src_file(const std::string& name, const std::string& content)
 {
     return {name, std::make_pair(content.data(), content.data() + content.size())};

@@ -248,4 +267,66 @@ TEST_CASE(compile_pointwise)
     EXPECT(result == output_literal.get_argument());
 }
+TEST_CASE(compile_math)
+{
+    std::vector<std::string> math_invoke = {
+        // clang-format off
+        "abs(x)",
+        "acos(x)",
+        "acosh(x)",
+        "asin(x)",
+        "asinh(x)",
+        "atan(x)",
+        "atanh(x)",
+        "ceil(x)",
+        "cos(x)",
+        "cosh(x)",
+        "erf(x)",
+        "exp(x)",
+        "floor(x)",
+        "isnan(x)",
+        "log(x)",
+        "max(x, x)",
+        "min(x, x)",
+        "pow(x, 0)",
+        "pow(x, x)",
+        "round(x)",
+        "rsqrt(x)",
+        "sin(x)",
+        "sinh(x)",
+        "sqrt(x)",
+        "tan(x)",
+        "tanh(x)",
+        "where(true, x, x)",
+        // clang-format on
+    };
+    std::vector<std::string> data_types;
+    auto vec_sizes = {2, 4, 6};
+    for(auto&& t : migraphx::shape::types())
+    {
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
+            continue;
+        auto name = migraphx::shape::cpp_type(t);
+        if(t == migraphx::shape::half_type)
+            name.insert(0, "migraphx::");
+        data_types.push_back(name);
+        migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+            return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+        });
+    }
+    migraphx::shape input{migraphx::shape::float_type, {5, 2}};
+    migraphx::gpu::hip_compile_options options;
+    options.global = 1024;
+    options.local = 1024;
+    options.inputs = {input};
+    options.output = input;
+    migraphx::par_for(math_invoke.size() * data_types.size(), 1, [&](auto i) {
+        const auto& t = data_types[i % data_types.size()];
+        const auto& invoke = math_invoke[i / data_types.size()];
+        auto src = migraphx::interpolate_string(math_template, {{"type", t}, {"invoke", invoke}});
+        auto co = migraphx::gpu::compile_hip_code_object(src, options);
+        (void)co;
+    });
+}
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
-import migraphx
+import migraphx, array, sys
+
+
+def create_buffer(t, data, shape):
+    a = array.array(t, data)
+    m = memoryview(a.tobytes())
+    return m.cast(t, shape)


 def test_add_op():
     p = migraphx.program()
     mm = p.get_main_module()
-    param_shape = migraphx.shape(lens=[3, 3], type="float")
-    x = mm.add_parameter("x", param_shape)
-    y = mm.add_parameter("y", param_shape)
+    x = mm.add_literal(create_buffer('f', [1.0] * 9, (3, 3)))
+    y = mm.add_literal(create_buffer('f', [2.0] * 9, (3, 3)))
     add_op = mm.add_instruction(migraphx.op("add"), [x, y])
     mm.add_return([add_op])
     p.compile(migraphx.get_target("ref"))
     params = {}
-    params["x"] = migraphx.generate_argument(param_shape)
-    params["y"] = migraphx.generate_argument(param_shape)
     output = p.run(params)[-1].tolist()
-    assert output == [
-        a + b for a, b in zip(params["x"].tolist(), params["y"].tolist())
-    ]
+    assert output == list([3.0] * 9)


 def test_if_then_else():

@@ -60,5 +61,6 @@ def test_if_then_else():
 if __name__ == "__main__":
-    test_add_op()
+    if sys.version_info >= (3, 0):
+        test_add_op()
     test_if_then_else()
import migraphx, sys
try:
import numpy as np
except:
sys.exit()
def test_add_op():
p = migraphx.program()
mm = p.get_main_module()
x = mm.add_literal(np.ones((3, 3), dtype='float32'))
y = mm.add_literal(2 * np.ones((3, 3), dtype='float32'))
add_op = mm.add_instruction(migraphx.op("add"), [x, y])
mm.add_return([add_op])
p.compile(migraphx.get_target("ref"))
params = {}
output = p.run(params)[-1].tolist()
assert output == list(3 * np.ones((9), dtype='float32'))
if __name__ == "__main__":
test_add_op()