Commit f06f6aa3 authored by Shucai Xiao's avatar Shucai Xiao
Browse files

merge changes from develop branch

parents 80a35596 ebfe9735
......@@ -138,6 +138,24 @@ std::size_t shape::index(std::size_t i) const
return result;
}
}
std::vector<std::size_t> shape::multi(std::size_t i) const
{
assert(this->standard());
std::vector<std::size_t> indices(lens().size());
std::transform(strides().begin(),
strides().end(),
lens().begin(),
indices.begin(),
[&](std::size_t stride, std::size_t len) {
assert(len > 0 and stride > 0);
return (i / stride) % len;
});
return indices;
}
bool shape::packed() const { return this->elements() == this->element_space(); }
bool shape::transposed() const
......
......@@ -2,14 +2,17 @@
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/as_shape.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/op/concat.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/matcher.hpp>
#include <unordered_set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
bool is_reshaper(instruction_ref ins)
const auto& reshaper_names()
{
// clang-format off
static const std::unordered_set<std::string> names = {
......@@ -19,17 +22,10 @@ bool is_reshaper(instruction_ref ins)
"unsqueeze"
};
// clang-format on
return contains(names, ins->name());
return names;
}
// Returns true when the (single) consumer chain of this instruction,
// skipping over "contiguous" instructions, ends in a "transpose".
bool is_transpose_output(instruction_ref ins)
{
    auto current = ins;
    while(true)
    {
        if(current->outputs().size() != 1)
            return false;
        auto next = current->outputs().front();
        if(next->name() != "contiguous")
            return next->name() == "transpose";
        // Keep walking through the contiguous chain.
        current = next;
    }
}
// An instruction is a reshaper when its operator is one of the known
// shape-only operators listed in reshaper_names().
bool is_reshaper(instruction_ref ins)
{
    const auto& names = reshaper_names();
    return contains(names, ins->name());
}
instruction_ref find_transpose_input(instruction_ref ins)
{
......@@ -42,62 +38,189 @@ instruction_ref find_transpose_input(instruction_ref ins)
return ins;
}
void simplify_reshapes::apply(program& p) const
auto get_transpose_dims(instruction_ref ins)
{
auto end = std::prev(p.end());
for(auto ins : iterator_for(p))
return any_cast<const op::transpose&>(ins->get_operator()).dims;
}
std::vector<int64_t> reorder_dims(std::vector<int64_t> dims, std::vector<int64_t> permutation)
{
std::vector<int64_t> result(dims.size());
assert(dims.size() == permutation.size());
for(std::size_t i = 0; i < dims.size(); i++)
{
if(ins == end and ins->name() == "contiguous")
continue;
// Skip possible dead instructions
if(ins->outputs().empty() and ins != end)
continue;
if(is_reshaper(ins))
result[i] = dims[permutation[i]];
}
return result;
}
// A transpose permutation is a no-op exactly when it is the identity
// permutation 0, 1, 2, ... (an empty permutation counts as identity).
bool is_no_transpose(const std::vector<int64_t>& dims)
{
    std::int64_t expected = 0;
    for(auto d : dims)
    {
        if(d != expected)
            return false;
        ++expected;
    }
    return true;
}
// Returns the indices of `data` ordered so that data[result[0]],
// data[result[1]], ... is sorted according to the comparator `op`.
template <class Vector, class Op>
std::vector<int64_t> sort_permutation(const Vector& data, Op op)
{
    std::vector<int64_t> indices(data.size());
    std::iota(indices.begin(), indices.end(), 0);
    // Compare positions by the values they refer to.
    auto by_value = [&](std::int64_t lhs, std::int64_t rhs) { return op(data[lhs], data[rhs]); };
    std::sort(indices.begin(), indices.end(), by_value);
    return indices;
}
// Computes the inverse of a permutation: if permutation maps i -> p[i], the
// result maps p[i] -> i. Directly scatters indices in O(n) instead of the
// previous O(n log n) sort-based inversion; the output is identical for any
// valid permutation of 0..n-1 (which both versions implicitly assume).
std::vector<int64_t> invert_permutation(const std::vector<int64_t>& permutation)
{
    std::vector<int64_t> result(permutation.size());
    for(std::size_t i = 0; i < permutation.size(); i++)
        result[permutation[i]] = static_cast<int64_t>(i);
    return result;
}
// Orders the axes of `s` by decreasing stride; for a transposed shape this
// yields the permutation that maps it back to a standard layout.
std::vector<int64_t> find_permutation(const shape& s)
{
    auto by_descending_stride = std::greater<>{};
    return sort_permutation(s.strides(), by_descending_stride);
}
struct find_reshaper
{
auto matcher() const
{
return match::name(reshaper_names())(
match::any_of[match::outputs()](match::name(reshaper_names())));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()))
{
if(std::any_of(ins->outputs().begin(), ins->outputs().end(), &is_reshaper))
continue;
// Gather reshapes
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()))
{
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
return i->get_shape() == (*start)->get_shape() and i != (*start);
});
if(last != reshapes.rend())
{
r = std::make_pair(*start, *last);
break;
}
}
if(r.first != r.second)
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
return i->get_shape() == (*start)->get_shape() and i != (*start);
});
if(last != reshapes.rend())
{
p.replace_instruction(r.first, r.second);
r = std::make_pair(*start, *last);
break;
}
}
else if(ins->name() == "transpose")
if(r.first != r.second)
{
p.replace_instruction(r.first, r.second);
}
}
};
// Matcher pass: removes shape-manipulating instructions whose output shape is
// identical to their input shape (they are no-ops).
struct find_nop_reshapes
{
    auto matcher() const
    {
        // Besides the plain reshapers, transpose and slice can also be no-ops.
        auto names = reshaper_names();
        names.insert("transpose");
        names.insert("slice");
        return match::name(names)(match::same_shape(match::arg(0)));
    }

    void apply(program& p, const match::matcher_result& mr) const
    {
        auto nop = mr.result;
        // Bypass the instruction entirely by wiring users to its input.
        p.replace_instruction(nop, nop->inputs().front());
    }
};
struct find_transpose
{
auto matcher() const
{
return match::name("transpose")(match::none_of(
match::skip_output(match::name("contiguous"))(match::name("transpose"))));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto x = ins;
auto t = ins;
std::vector<std::int64_t> dims(ins->get_shape().lens().size());
std::iota(dims.begin(), dims.end(), 0);
do
{
dims = reorder_dims(get_transpose_dims(t), dims);
x = t;
t = find_transpose_input(x);
} while(x != t and t->name() == "transpose");
if(t == ins or t->name() != "transpose")
return;
if(is_no_transpose(dims))
{
if(is_transpose_output(ins))
continue;
auto x = ins;
auto t = ins;
do
{
x = t;
t = find_transpose_input(x);
} while(x != t and t->name() == "transpose");
if(t == ins or t->name() != "transpose")
continue;
p.replace_instruction(ins, t->inputs().front());
}
else
{
p.replace_instruction(ins, op::transpose{{dims}}, t->inputs().front());
}
}
};
// Matcher pass: rewrites a concat whose inputs all have transposed shapes into
// transpose -> concat(standard layout) -> transpose, so the concat itself runs
// on a standard layout.
struct find_concat_transpose
{
    auto matcher() const
    {
        // All inputs must share a shape and that shape must be transposed.
        return match::name("concat")(match::same_input_shapes(),
        match::all_of[match::inputs()](match::transpose_shape()));
    }
    void apply(program& p, const match::matcher_result& mr) const
    {
        auto ins = mr.result;
        auto s = ins->inputs().front()->get_shape();
        assert(s.transposed());
        auto op = any_cast<op::concat>(ins->get_operator());
        // permutation restores a standard layout (axes by decreasing stride);
        // ipermutation maps back to the original transposed layout.
        auto permutation = find_permutation(s);
        auto ipermutation = invert_permutation(permutation);
        // Remap the concat axis into the standard-layout coordinate system.
        op.axis = ipermutation[op.axis];
        std::vector<instruction_ref> inputs;
        std::transform(
        ins->inputs().begin(), ins->inputs().end(), std::back_inserter(inputs), [&](auto i) {
        // If the input is itself a transpose of a standard tensor,
        // use that tensor directly instead of stacking transposes.
        if(i->name() == "transpose" and i->inputs().front()->get_shape().standard())
        return i->inputs().front();
        return p.insert_instruction(ins, op::transpose{permutation}, i);
        });
        auto concat = p.insert_instruction(ins, op, inputs);
        // Transpose back so downstream users see the original layout.
        auto t = p.insert_instruction(ins, op::transpose{ipermutation}, concat);
        assert(ins->get_shape().lens() == t->get_shape().lens());
        p.replace_instruction(ins, t);
    }
};
// Runs all reshape-simplification matchers over every instruction in the
// program, skipping the trailing contiguous (kept for the program output)
// and dead instructions.
void simplify_reshapes::apply(program& p) const
{
    auto end = std::prev(p.end());
    for(auto ins : iterator_for(p))
    {
        // A final contiguous defines the program's output layout; keep it.
        if(ins == end and ins->name() == "contiguous")
        continue;
        // Skip possible dead instructions
        if(ins->outputs().empty() and ins != end)
        continue;
        match::find_matches(p,
        ins,
        find_nop_reshapes{},
        find_reshaper{},
        find_transpose{},
        find_concat_transpose{});
    }
}
......
......@@ -2,7 +2,21 @@
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/batch_norm.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/par_dfor.hpp>
......@@ -650,18 +664,11 @@ struct cpu_softmax
std::string name() const { return "cpu::softmax"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
template <typename T>
std::size_t compute_batch_index(T idx, shape& batch_shape, int axis) const
{
idx[axis] = 0;
return batch_shape.index(idx);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto batch_lens = output_shape.lens();
std::size_t n_dims = batch_lens[op.axis];
batch_lens[op.axis] = 1;
shape batch_shape{shape::int32_type, batch_lens};
......@@ -669,26 +676,33 @@ struct cpu_softmax
using value_type = typename decltype(input)::value_type;
std::vector<value_type> batch_max(batch_shape.elements(),
std::numeric_limits<value_type>::lowest());
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
});
std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
par_for(batch_shape.elements(), [&](auto i) {
auto idx = batch_shape.multi(i);
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
}
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
output(idx.begin(), idx.end()) =
std::exp(input(idx.begin(), idx.end()) - batch_max[index]);
});
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
std::size_t index = output_shape.index(idx);
output[index] = std::exp(input[index] - batch_max[i]);
}
std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
batch_sum[index] += output(idx.begin(), idx.end());
});
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
batch_sum[i] += output(idx.begin(), idx.end());
}
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
output(idx.begin(), idx.end()) /= batch_sum[index];
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
output(idx.begin(), idx.end()) /= batch_sum[i];
}
});
});
......@@ -708,49 +722,50 @@ struct cpu_logsoftmax
std::string name() const { return "cpu::logsoftmax"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
template <typename T>
std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
{
idx[axis] = 0;
return batch_shape.index(idx);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto batch_lens = output_shape.lens();
std::size_t n_dims = batch_lens[op.axis];
batch_lens[op.axis] = 1;
shape batch_shape{shape::int32_type, batch_lens};
// use a parallel implementation to acheive better performance
// one thread for one batch
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
std::vector<value_type> batch_max(batch_shape.elements(),
std::numeric_limits<value_type>::lowest());
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
});
std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
});
par_for(batch_shape.elements(), [&](auto i) {
auto idx = batch_shape.multi(i);
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
}
std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
});
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
std::size_t index = output_shape.index(idx);
output[index] = input[index] - batch_max[i];
}
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
}
for(std::size_t i = 0; i < batch_sum.size(); ++i)
{
batch_sum[i] = std::log(batch_sum[i]);
}
shape_for_each(output_shape, [&](auto idx) {
auto index = this->compute_batch_index(idx, batch_shape, op.axis);
output(idx.begin(), idx.end()) -= batch_sum[index];
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[op.axis] = j;
output(idx.begin(), idx.end()) -= batch_sum[i];
}
});
});
......
......@@ -12,9 +12,12 @@ endif()
add_library(migraphx_device
device/add.cpp
device/argmax.cpp
device/argmin.cpp
device/max.cpp
device/min.cpp
device/exp.cpp
device/erf.cpp
device/log.cpp
device/sin.cpp
device/cos.cpp
......@@ -36,6 +39,7 @@ add_library(migraphx_device
device/sub.cpp
device/pack.cpp
device/clip.cpp
device/reduce_sum.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
......@@ -44,6 +48,8 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
add_library(migraphx_gpu
argmax.cpp
argmin.cpp
eliminate_workspace.cpp
fuse_ops.cpp
hip.cpp
......@@ -73,6 +79,7 @@ add_library(migraphx_gpu
schedule_model.cpp
adjust_allocation.cpp
clip.cpp
reduce_sum.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
#include <migraphx/gpu/argmax.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Validates the two inputs (data tensor + preallocated output buffer) and
// derives the result shape from the data shape only.
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).standard();
    return op.compute_shape({inputs.at(0)});
}
// Launches the device argmax kernel: args.front() is the input data and
// args.back() is the output allocation, which is also returned.
argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::argmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
    return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/argmin.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Validates the two inputs (data tensor + preallocated output buffer) and
// derives the result shape from the data shape only.
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).standard();
    return op.compute_shape({inputs.at(0)});
}
// Launches the device argmin kernel: args.front() is the input data and
// args.back() is the output allocation, which is also returned.
argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::argmin(ctx.get_stream().get(), args.back(), args.front(), op.axis);
    return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Argmax along `axis`: thin wrapper delegating to the shared arg-reduction
// kernel parameterized with the argmax operation.
void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    arg_op(argmax_op{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Argmin along `axis`: thin wrapper delegating to the shared arg-reduction
// kernel parameterized with the argmin operation.
void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    arg_op(argmin_op{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -10,22 +10,22 @@ namespace gpu {
namespace device {
argument concat(hipStream_t stream,
const migraphx::shape& output_shape,
const migraphx::shape&,
std::vector<migraphx::argument> args,
std::vector<std::size_t> offsets)
{
for(std::size_t l = 0; l < args.size() - 1; l++)
auto ninputs = args.size() - 1;
for(std::size_t j = 0; j < ninputs; j++)
{
auto argl = args[l];
std::size_t nelements = argl.get_shape().elements();
visit_all(args.back(), argl)([&](auto output, auto input) {
visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
auto* outptr = output.data() + offsets[l];
const auto* inptr = input.data();
hip_tensor_descriptor<ndim> desc_input(input.get_shape());
hip_tensor_descriptor<ndim> desc_output(output.get_shape());
gs_launch(stream, nelements)(
[=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
auto&& arg = args[j];
std::size_t nelements = arg.get_shape().elements();
auto offset = offsets[j];
shape arg_shape{arg.get_shape().type(), arg.get_shape().lens()};
hip_visit_all(args.back(), arg, arg_shape)([&](auto output, auto input, auto input_shape) {
gs_launch(stream, nelements)([=](auto i) {
auto input_idx = input_shape.multi(i);
auto idx = output.get_shape().index(input_idx);
output.data()[idx + offset] = input[input_idx];
});
});
}
......
#include <migraphx/gpu/device/erf.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Elementwise error function. to_hip_type converts the element to a type
// ::erf accepts (presumably widening half types — see types.hpp).
void erf(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::erf(to_hip_type(x)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -11,35 +11,30 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument gather(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
int axis)
argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis)
{
auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
visit_all(args.back(), args[0])([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
args[1].visit([&](auto indices) {
const auto* indices_ptr = device_cast(indices.data());
auto* out_ptr = device_cast(output.data());
const auto* in_ptr = device_cast(input.data());
auto& input_shape = args[0].get_shape();
auto lens = input_shape.lens();
lens[axis_index] = args[1].get_shape().elements();
migraphx::shape out_comp_shape{output_shape.type(), lens};
visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
gs_launch(stream, nelements)([=](auto ii) {
auto in_idx = desc_output.multi(ii);
in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
out_ptr[ii] = in_ptr[desc_input.linear(in_idx)];
auto axis_index = (axis < 0) ? (axis + arg1.get_shape().lens().size()) : axis;
auto& input_shape = arg1.get_shape();
auto lens = input_shape.lens();
lens[axis_index] = arg2.get_shape().elements();
shape out_comp_shape{result.get_shape().type(), lens};
std::size_t nelements = result.get_shape().elements();
visit_all(result, arg1)([&](auto output, auto input_v) {
hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
arg2.visit([&](auto indices) {
const auto* indices_ptr = device_cast(indices.data());
auto* output_ptr = device_cast(output.data());
gs_launch(stream, nelements)([=](auto i) {
auto idx = out_comp.multi(i);
idx[axis_index] = indices_ptr[idx[axis_index]];
output_ptr[i] = input[idx];
});
});
});
});
return args.back();
return result;
}
} // namespace device
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
/// Fixed-size array usable in HIP device code (std::array is not available on
/// the device). Provides the small set of operations the kernels need:
/// indexing, iteration, dot product, element product, and elementwise +/*.
template <class T, std::size_t N>
struct hip_array
{
    T d[N];
    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    // Size as an integral_constant so it can feed constant expressions.
    MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<std::size_t, N> size() const { return {}; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    /// Inner product with x (e.g. multi-index . strides -> linear offset).
    MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
    {
        T result = 0;
        for(std::size_t i = 0; i < N; i++)
            result += x[i] * d[i];
        return result;
    }

    /// Product of all elements (e.g. lens -> total element count).
    MIGRAPHX_DEVICE_CONSTEXPR T product() const
    {
        T result = 1;
        for(std::size_t i = 0; i < N; i++)
            result *= d[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator*(const hip_array& x, const hip_array& y)
    {
        // Value-initialize: consistent with operator+, and an uninitialized
        // local is not allowed in a C++14/17 constexpr function body.
        hip_array result{};
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] * y[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator+(const hip_array& x, const hip_array& y)
    {
        hip_array result{};
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] + y[i];
        return result;
    }
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -11,9 +11,33 @@ namespace device {
struct index
{
std::size_t global;
std::size_t local;
std::size_t group;
std::size_t global = 0;
std::size_t local = 0;
std::size_t group = 0;
__device__ std::size_t nglobal() const { return blockDim.x * gridDim.x; } // NOLINT
__device__ std::size_t nlocal() const { return blockDim.x; } // NOLINT
// Grid-stride loop: this thread handles i = global, global + nglobal(), ...
// while i < n, so any n is covered by a fixed-size launch.
template <class F>
__device__ void global_stride(std::size_t n, F f) const
{
    const auto stride = nglobal();
    for(std::size_t i = global; i < n; i += stride)
    {
        f(i);
    }
}
// Block-stride loop: this thread handles i = local, local + nlocal(), ...
// while i < n, partitioning [0, n) across the threads of one block.
template <class F>
__device__ void local_stride(std::size_t n, F f) const
{
    const auto stride = nlocal();
    for(std::size_t i = local; i < n; i += stride)
    {
        f(i);
    }
}
};
template <class F>
......@@ -35,18 +59,26 @@ inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
};
}
// Invoke f with both the flat index and the launch index. Selected via the
// trailing return type (SFINAE) only when f accepts two arguments.
template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index idx) -> decltype(f(i, idx))
{
    return f(i, idx);
}
// Fallback overload: f only takes the flat index; the launch index is unused.
template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i))
{
    return f(i);
}
inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
{
std::size_t groups = 1 + n / local;
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) {
for(size_t i = idx.global; i < n; i += nglobal)
{
f(i);
}
});
launch(stream, nglobal, local)(
[=](auto idx) { idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); });
};
}
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Binary reduction operator: addition.
struct sum
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x + y;
    }
};
// Identity transform: returns its argument unchanged (used as a default
// read/write functor for reductions).
struct id
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x;
    }
};
// Binary reduction operator: maximum of the two operands.
struct max
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x > y ? x : y;
    }
};
// Binary reduction operator: minimum of the two operands.
struct min
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x < y ? x : y;
    }
};
// Implicitly converts to the lowest (most negative finite) value of the
// requested type; used as the init value for max-style reductions.
struct lowest
{
    template <class T>
    operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::lowest());
    }
};
// Implicitly converts to the largest finite value of the requested type;
// used as the init value for min-style reductions.
struct highest
{
    template <class T>
    operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::max());
    }
};
#ifdef MIGRAPHX_NO_DPP
// Block-wide reduction (fallback path without DPP instructions): each thread
// accumulates a strided slice of [0, n) into a private value, then a shared
// memory tree reduction combines the per-thread partials. N is the maximum
// block size. All threads return the value from slot 0.
template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
    using type = decltype(f(idx.local));
    // One shared-memory slot per thread in the block.
    MIGRAPHX_DEVICE_SHARED type buffer[N];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    buffer[idx.local] = x;
    __syncthreads();
    // Pairwise tree reduction; the final result accumulates into buffer[0].
    for(std::size_t s = 1; s < idx.nlocal(); s *= 2)
    {
        const std::size_t index = 2 * s * idx.local;
        if(index + s < idx.nlocal())
        {
            buffer[index] = op(buffer[index], buffer[index + s]);
        }
        __syncthreads();
    }
    return buffer[0];
}
#else
// Encode the DPP control word for "row shift right by x" (base opcode 0x110).
constexpr unsigned int dpp_row_shr(unsigned int x)
{
    return 0x110u | x;
}
// Encode the DPP control word for a row broadcast: only lanes 15 and 31 are
// valid broadcast sources in the DPP encoding.
constexpr unsigned int dpp_row_bcast(unsigned int x)
{
    if(x == 15)
        return 0x142;
    if(x == 31)
        return 0x143;
    throw std::runtime_error("Unknown bcast");
}
// Apply the __llvm_amdgcn_move_dpp cross-lane move to a value of arbitrary
// type T: the value is reinterpreted as 32-bit registers through a union and
// each register is moved with the given DPP control word and masks.
template <unsigned int DppCtrl,
          unsigned int RowMask = 0xf,
          unsigned int BankMask = 0xf,
          bool BoundCtrl = false,
          class T>
__device__ T dpp_mov(T& x)
{
    // Number of 32-bit registers covering T (types smaller than 4 bytes use one).
    static const std::size_t n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
    union type
    {
        uint32_t reg[n];
        T data;
    };
    type output{};
    type input{};
    // cppcheck-suppress unreadVariable
    input.data = x;
    for(std::size_t i = 0; i < n; i++)
    {
        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
    }
    return output.data;
}
// Reduce a value across the 64 lanes of a wavefront using DPP cross-lane
// moves: row shifts by 1, 2, 4, 8 followed by row broadcasts from lanes 15
// and 31. The combined result ends up in the last lane (see block_reduce,
// which reads it from lane 63).
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);
    in = op(in, out);
    // Shifts of 4 and 8 restrict the bank mask so only full source banks mix.
    out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in = op(in, out);
}
// Specialization for float + sum: hand-written GCN assembly performing the
// same shift/broadcast sequence as the generic dpp_reduce above. Compiled out
// under clang-tidy, which cannot parse the inline asm.
__device__ inline void dpp_reduce(float& x, sum)
{
#ifdef MIGRAPHX_USE_CLANG_TIDY
    (void)x;
#else
    __asm__ volatile("s_nop 4\n"
                     "v_add_f32 %0 %0 %0 row_shr:1\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:2\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
                     "s_nop 1\n"
                     : "=v"(x)
                     : "0"(x));
#endif
}
// Block-wide reduction (DPP path): each thread accumulates a strided slice of
// [0, n), each 64-lane wavefront is reduced with DPP cross-lane ops, lane 63
// of every wavefront writes its partial into shared memory, then every thread
// serially combines the per-wavefront partials. N is the maximum block size.
template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
    using type = decltype(f(idx.local));
    // One shared-memory slot per 64-lane wavefront.
    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    // After dpp_reduce, lane 63 of each wavefront holds the wavefront total.
    dpp_reduce(x, op);
    const auto ldsidx = idx.local / 64;
    if((idx.local % 64) == 63)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();
    type y = init;
    for(std::size_t i = 0; i < idx.nlocal() / 64; i++)
    {
        y = op(y, buffer[i]);
    }
    return y;
}
#endif
// Choose a power-of-two block size: start at 64 and double until it reaches
// max_block_size or is at least n.
constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
    std::size_t result = 64;
    while(result < max_block_size and result < n)
        result *= 2;
    return result;
}
// Generic device reduction: reduces `arg` into `result` with operator `op`
// starting from `init`. Axes where input and output lens differ are the
// reduced axes; `read_input`/`read_output` transform values on the way in/out.
// One thread block is launched per output element and block_reduce combines
// that block's slice.
template <class Op, class T, class Input, class Output>
void reduce(hipStream_t stream,
            const argument& result,
            const argument& arg,
            Op op,
            T init,
            Input read_input,
            Output read_output)
{
    auto&& output_shape = result.get_shape();
    auto&& input_shape = arg.get_shape();
    // Per-axis extent of the reduction: 1 where the axis is kept, the input
    // length where it is reduced.
    std::vector<std::size_t> reduce_lens;
    std::transform(output_shape.lens().begin(),
                   output_shape.lens().end(),
                   input_shape.lens().begin(),
                   std::back_inserter(reduce_lens),
                   [](auto x, auto y) -> std::size_t {
                       if(x == y)
                           return 1;
                       else
                           return y;
                   });
    shape reduce_slice{output_shape.type(), reduce_lens};
    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
        auto nelements = result.get_shape().elements();
        auto relements = reduce_slice.elements();
        const std::size_t max_block_size = 256;
        const std::size_t block_size = compute_block_size(relements, max_block_size);
        // block_size threads per output element; i / block_size recovers which
        // output element this thread works on.
        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
            const auto out_idx = i / block_size;
            auto base_idx = output.get_shape().multi(out_idx);
            // Offset the output coordinate by the position inside the reduce
            // slice to address each input element of the reduction.
            auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
                auto reduce_idx = reduce_shape.multi(j);
                return read_input(input[reduce_idx + base_idx]);
            });
            // Only one thread per block writes the reduced value.
            if(idx.local == 0)
                output.data()[out_idx] = read_output(r);
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#include <migraphx/gpu/device/array.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Fixed-rank, device-usable mirror of migraphx::shape: carries lens/strides
// by value (N known at compile time) so kernels can compute indices without
// touching host-side data structures.
template <std::size_t N>
struct hip_shape
{
    using hip_index = hip_array<std::size_t, N>;
    hip_array<std::size_t, N> lens = {};
    hip_array<std::size_t, N> strides = {};
    // True when the shape is packed row-major (see index(std::size_t) below).
    bool standard = false;
    __device__ __host__ hip_shape() = default;
    // Host-side converting constructor; the runtime rank must match N.
    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
    }
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }
    // Linear offset of a multi-index: dot product with the strides.
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < x.size(); i++)
        idx += *(x.begin() + i) * strides[i];
        return idx;
    }
    // Map a logical element ordinal to a storage offset. For standard shapes
    // this is the identity; otherwise decompose i dimension by dimension from
    // the innermost axis and apply the actual strides.
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
    {
        if(this->standard)
        return i;
        else
        {
            const std::size_t rank = this->lens.size();
            std::size_t s = 1;
            std::size_t result = 0;
            for(std::size_t j = 0; j < this->lens.size(); j++)
            {
                const std::size_t k = rank - j - 1;
                const std::size_t stride = this->strides[k];
                const std::size_t len = this->lens[k];
                const std::size_t slen = s * len;
                const std::size_t idx = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }
    // Convert a linear index to a multi-index by successive division by the
    // strides. NOTE(review): this matches shape::multi, which asserts a
    // standard layout — presumably only valid for descending packed strides.
    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
    {
        hip_index result;
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < result.size(); is++)
        {
            result[is] = tidx / strides[is];
            tidx = tidx % strides[is];
        }
        return result;
    }
    // Normalize a multi-index whose entries may exceed their dimension: from
    // the innermost axis outward, clamp each coordinate to len-1 and push the
    // excess into the next-outer coordinate.
    MIGRAPHX_DEVICE_CONSTEXPR hip_index carry(hip_index result) const
    {
        std::ptrdiff_t rem = 0;
        for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
        {
            auto z = result[i] + rem;
            rem = z - std::ptrdiff_t(lens[i]) + 1;
            if(rem > 0)
            z -= rem;
            else
            rem = 0;
            result[i] = z;
        }
        return result;
    }
};
// Convenience factory: relies on hip_shape<N>'s implicit converting
// constructor from shape (which asserts the rank matches N).
template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
#include <hip/hip_runtime.h>
#include <migraphx/functional.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/visit.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Dispatch a runtime tensor rank n (1-5) to a compile-time value: f is called
// with std::integral_constant<std::size_t, n>. Throws for unsupported ranks.
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1: f(std::integral_constant<std::size_t, 1>{}); break;
    case 2: f(std::integral_constant<std::size_t, 2>{}); break;
    case 3: f(std::integral_constant<std::size_t, 3>{}); break;
    case 4: f(std::integral_constant<std::size_t, 4>{}); break;
    case 5: f(std::integral_constant<std::size_t, 5>{}); break;
    default: throw std::runtime_error("Unknown tensor size");
    }
}
// Multi-dimensional index for an NDim-rank tensor, stored as a fixed-size
// array of coordinates.
template <std::size_t NDim>
using hip_tensor_index = hip_array<std::size_t, NDim>;
// Minimal fixed-size coordinate array usable from both host and device code.
// NOTE(review): presumably superseded by hip_tensor_index — verify remaining
// users before removing it.
template <size_t NDim>
struct hip_index
{
    size_t d[NDim]; // one coordinate per dimension; publicly accessible
    __device__ __host__ size_t& operator[](size_t i) { return d[i]; }
    __device__ __host__ size_t operator[](size_t i) const { return d[i]; }
};
template <size_t NDim>
template <std::size_t NDim>
struct hip_tensor_descriptor
{
__device__ __host__ hip_tensor_descriptor() = default;
......@@ -63,12 +22,11 @@ struct hip_tensor_descriptor
std::copy(s.strides().begin(), s.strides().end(), strides);
}
__device__ __host__ hip_index<NDim> multi(size_t idx) const
__device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
{
hip_index<NDim> result{};
size_t tidx = idx;
for(size_t is = 0; is < NDim; is++)
hip_tensor_index<NDim> result{};
std::size_t tidx = idx;
for(std::size_t is = 0; is < NDim; is++)
{
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
......@@ -76,16 +34,15 @@ struct hip_tensor_descriptor
return result;
}
__device__ __host__ size_t linear(hip_index<NDim> s) const
__device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
{
size_t idx = 0;
for(size_t i = 0; i < NDim; i++)
std::size_t idx = 0;
for(std::size_t i = 0; i < NDim; i++)
idx += s[i] * strides[i];
return idx;
}
size_t lens[NDim] = {};
size_t strides[NDim] = {};
std::size_t lens[NDim] = {};
std::size_t strides[NDim] = {};
};
} // namespace device
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#include <migraphx/gpu/device/shape.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Non-owning device-side view over tensor data: a raw element pointer paired
// with a fixed-rank hip_shape<N> describing extents and strides.
template <class T, std::size_t N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index  = typename hip_shape<N>::hip_index;

    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : ptr(x.data()), sh(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : ptr(x), sh(ss) {}

    // Shape describing the layout of the viewed data.
    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return sh; }
    // Number of elements in the view.
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return sh.elements(); }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return ptr; }

    // Element access using any index form accepted by hip_shape::index.
    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return ptr[sh.index(i)];
    }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return ptr; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return ptr + size(); }

    private:
    value_type* ptr = nullptr;
    hip_shape<N> sh{};
};
// Wrap a raw pointer and a host shape in a rank-N device tensor view.
template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}
// Convert a host tensor_view into a rank-N device tensor view.
template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -8,14 +8,45 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#include <hip/hip_runtime.h>
#include <migraphx/half.hpp>
#include <migraphx/config.hpp>
#include <migraphx/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
// Clang extended vector type: N lanes of T (used for vectorized access).
template <class T, std::size_t N>
using vec = T __attribute__((ext_vector_type(N)));
// Reinterpret a pointer to N-wide vectors as a pointer to their scalar
// elements.
template <std::size_t N, class T>
__device__ __host__ T* as_pointer(vec<T, N>* x)
{
    return reinterpret_cast<T*>(x);
}
// Reinterpret a scalar pointer as a pointer to N-wide vectors.
// NOTE(review): assumes the underlying buffer is sized and aligned for
// vec<T, N> — confirm at call sites.
template <std::size_t N, class T>
__device__ __host__ vec<T, N>* as_vec(T* x)
{
    return reinterpret_cast<vec<T, N>*>(x);
}
// View a tensor of scalars as a tensor of N-wide vectors over the same
// storage.
// NOTE(review): the shape is passed through unchanged, so it still counts
// scalar elements — presumably callers index accordingly; verify.
template <std::size_t N, class T>
tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
{
    return {x.get_shape(), as_vec<N>(x.data())};
}
// Pack the given views into a callable: invoking the result with (f, n)
// applies f to the n-th N-wide vector element of each captured view.
template <std::size_t N, class... Ts>
auto pack_vec(Ts... xs)
{
    return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
}
// Native half-precision floating-point type on the HIP device side.
using gpu_half = __fp16;
namespace detail {
......@@ -25,6 +56,12 @@ struct device_type
using type = T;
};
// Map each lane type of a vector through device_type, preserving the width.
template <class T, std::size_t N>
struct device_type<vec<T, N>>
{
    using type = vec<typename device_type<T>::type, N>;
};
template <>
struct device_type<half>
{
......@@ -38,7 +75,7 @@ struct host_type
};
template <>
struct device_type<gpu_half>
struct host_type<gpu_half>
{
using type = half;
};
......@@ -54,7 +91,7 @@ using device_type = typename detail::device_type<T>::type;
// Reinterpret a device-representation value as its host equivalent without
// performing a value conversion.
template <class T>
host_type<T> host_cast(T x)
{
    // Cast through a const reference: reinterpret_cast cannot convert
    // between unrelated object types by value. (A merge left an unreachable
    // duplicate by-value return here; it has been removed.)
    return reinterpret_cast<const host_type<T>&>(x);
}
template <class T>
......@@ -76,13 +113,19 @@ device_type<T>* device_cast(T* x)
}
template <class T>
T to_hip_type(T x)
tensor_view<device_type<T>> device_cast(tensor_view<T> x)
{
return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
}
// Identity pass-through: most types are used as-is inside HIP kernels.
template <class T>
__device__ __host__ T to_hip_type(T x)
{
    return x;
}
// Hip doesn't support __fp16
// Widen gpu_half to float for kernel arithmetic. The merge left two
// definitions of this same overload; only the __device__ __host__ annotated
// one is kept, since the duplicate was a redefinition error.
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
} // namespace device
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#include <migraphx/gpu/device/types.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Fixed-capacity inline vector usable from both host and device code.
// Elements live in an embedded array of size N; no dynamic allocation.
template <class T, std::size_t N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : count(s) {}
    // Copy the range [start, last) into the inline buffer.
    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto pos = std::copy(start, last, buf);
        count    = std::distance(buf, pos);
    }
    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), buf);
        count = x.size();
    }

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return buf[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return buf[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return buf[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return buf[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return buf[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return buf[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return buf; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return buf; }
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return count; }
    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return buf; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return buf; }
    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return buf + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return buf + size(); }

    // Append x; no bounds check is performed, so exceeding capacity N
    // writes past the inline buffer (undefined behavior).
    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        buf[count++] = static_cast<U&&>(x);
    }

    private:
    T buf[N]          = {};
    std::size_t count = 0;
};
// Copy a std::vector into a fixed-capacity hip_vector of capacity N;
// x.size() must not exceed N.
template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    // Delegate to the iterator-range constructor, which copies the elements
    // and records the resulting length.
    return hip_vector<T, N>(x.begin(), x.end());
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment