"vscode:/vscode.git/clone" did not exist on "37dc03271b0caaf900725215bcd9fc4159bad113"
Commit c1ec929c authored by Shucai Xiao's avatar Shucai Xiao
Browse files

merge changes from develop branch

parents abe2a889 03225b57
@@ -59,8 +59,8 @@ inline auto mi_nglobal(const hip_shape<N>& s, index_int nlocal)
assert(s.elements() > 0);
index_int n = s.elements();
index_int groups = (n + nlocal - 1) / nlocal;
-    // change the max group num to 1 Million
-    index_int nglobal = std::min<index_int>((1 << 20), groups) * nlocal;
+    // max possible number of blocks is set to 1B (1,073,741,824)
+    index_int nglobal = std::min<index_int>(1073741824, groups) * nlocal;
assert(groups > 0);
assert(nglobal > 0);
......
@@ -53,6 +53,12 @@ __device__ void block_scan(index idx, Op op, T init, index_int n, Input input, O
output);
}
+template <class F>
+constexpr auto reverse_scan(index_int n, F f)
+{
+    return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
+}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
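The reverse_scan adapter added here flips the element index so that a forward block scan reads and writes back to front: index i maps to n - i - 1. A minimal host-side sketch of the same index flip (plain C++, illustrative names; not part of the commit):

#include <cstdio>

template <class F>
constexpr auto reverse_scan(int n, F f)
{
    return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
}

int main()
{
    int data[4] = {1, 2, 3, 4};
    auto read          = [&](int j) { return data[j]; };
    auto reversed_read = reverse_scan(4, read);
    for(int i = 0; i < 4; i++)
        std::printf("%d ", reversed_read(i)); // prints: 4 3 2 1
}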
......
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
@@ -8,30 +9,108 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis)
+void prefix_scan_sum(hipStream_t stream,
+                     const argument& result,
+                     const argument& arg,
+                     int32_t axis,
+                     bool exclusive,
+                     bool reverse)
{
-    const index_int block_size = 256;
-    const index_int n = arg.get_shape().lens()[axis];
-    auto rlens = result.get_shape().lens();
-    rlens[axis] = 1;
+    const index_int max_block_size = 256;
+    const index_int n = arg.get_shape().lens()[axis];
+    auto rlens = result.get_shape().lens();
+    rlens[axis] = 1;
hip_visit_all(result, arg, result.get_shape().with_lens(rlens))(
[=](auto output, auto input, auto rshape) {
-gs_launch(stream, rshape.elements() * block_size, block_size)(
-    [=](auto i, auto idx) __device__ {
-        const auto ridx = rshape.multi(i / block_size);
-        auto compute_idx = [&](auto j) {
-            auto k = ridx;
-            k[axis] = j;
-            return k;
-        };
-        block_scan<block_size>(
-            idx,
-            sum{},
-            0,
-            n,
-            [&](auto j) { return input[compute_idx(j)]; },
-            [&](auto j, auto x) { output[compute_idx(j)] = x; });
-    });
+const index_int block_size = compute_block_size(rshape.elements(), max_block_size);
+if(reverse and exclusive)
+{
+    gs_launch(stream, rshape.elements() * block_size, block_size)(
+        [=](auto i, auto idx) __device__ {
+            const auto ridx = rshape.multi(i / block_size);
+            auto compute_idx = [&](auto j) {
+                auto k = ridx;
+                k[axis] = j;
+                return k;
+            };
+            block_scan<max_block_size>(
+                idx,
+                sum{},
+                0,
+                n,
+                reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+                reverse_scan(n, [&](auto j, auto x) {
+                    if(j == n - 1)
+                        output[compute_idx(j)] = 0;
+                    if(j > 0)
+                        output[compute_idx(j - 1)] = x;
+                }));
+        });
+}
+else if(reverse)
+{
+    gs_launch(stream, rshape.elements() * block_size, block_size)(
+        [=](auto i, auto idx) __device__ {
+            const auto ridx = rshape.multi(i / block_size);
+            auto compute_idx = [&](auto j) {
+                auto k = ridx;
+                k[axis] = j;
+                return k;
+            };
+            block_scan<max_block_size>(
+                idx,
+                sum{},
+                0,
+                n,
+                reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+                reverse_scan(n, [&](auto j, auto x) { output[compute_idx(j)] = x; }));
+        });
+}
+else if(exclusive)
+{
+    gs_launch(stream, rshape.elements() * block_size, block_size)(
+        [=](auto i, auto idx) __device__ {
+            const auto ridx = rshape.multi(i / block_size);
+            auto compute_idx = [&](auto j) {
+                auto k = ridx;
+                k[axis] = j;
+                return k;
+            };
+            block_scan<max_block_size>(
+                idx,
+                sum{},
+                0,
+                n,
+                [&](auto j) { return input[compute_idx(j)]; },
+                [&](auto j, auto x) {
+                    auto k = j + 1;
+                    if(j == 0)
+                        output[compute_idx(0)] = 0;
+                    if(k < n)
+                        output[compute_idx(k)] = x;
+                });
+        });
+}
+else
+{
+    gs_launch(stream, rshape.elements() * block_size, block_size)(
+        [=](auto i, auto idx) __device__ {
+            const auto ridx = rshape.multi(i / block_size);
+            auto compute_idx = [&](auto j) {
+                auto k = ridx;
+                k[axis] = j;
+                return k;
+            };
+            block_scan<max_block_size>(
+                idx,
+                sum{},
+                0,
+                n,
+                [&](auto j) { return input[compute_idx(j)]; },
+                [&](auto j, auto x) { output[compute_idx(j)] = x; });
+        });
+}
});
}
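For reference, the four variants this kernel now distinguishes, applied to {1, 2, 3, 4} with sum: inclusive gives {1, 3, 6, 10}; exclusive gives {0, 1, 3, 6}; reverse inclusive gives {10, 9, 7, 4}; reverse exclusive gives {9, 7, 4, 0}. A host-side reference sketch (illustrative only; the kernel computes this per axis slice on the GPU):

#include <cstdio>
#include <vector>

std::vector<int> scan(std::vector<int> in, bool exclusive, bool reverse)
{
    int n = in.size();
    std::vector<int> out(n);
    int acc = 0;
    for(int i = 0; i < n; i++)
    {
        int j = reverse ? n - i - 1 : i; // same index flip as reverse_scan
        if(exclusive)
        {
            out[j] = acc; // running sum *before* element j is included
            acc += in[j];
        }
        else
        {
            acc += in[j];
            out[j] = acc; // running sum *including* element j
        }
    }
    return out;
}

int main()
{
    // Prints, in order: inclusive, exclusive, reverse inclusive, reverse exclusive.
    for(bool rev : {false, true})
        for(bool excl : {false, true})
        {
            for(int v : scan({1, 2, 3, 4}, excl, rev))
                std::printf("%d ", v);
            std::printf("\n");
        }
}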
......
@@ -10,7 +10,12 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis);
+void prefix_scan_sum(hipStream_t stream,
+                     const argument& result,
+                     const argument& arg,
+                     int32_t axis,
+                     bool exclusive,
+                     bool reverse);
} // namespace device
} // namespace gpu
......
@@ -9,6 +9,8 @@
#include <miopen/miopen.h>
#include <migraphx/config.hpp>
+#include <sstream>
+#ifdef HAS_FIND_MODE_API
extern "C" miopenStatus_t miopenHiddenSetConvolutionFindMode(miopenConvolutionDescriptor_t convDesc,
int findMode);
@@ -132,12 +134,16 @@ inline convolution_descriptor make_deconv(const T& op)
inline pooling_descriptor make_pooling(const migraphx::op::pooling& op)
{
miopenPoolingMode_t mode;
if(op.mode == "max")
if(op.mode == op::pooling_mode::max)
mode = miopenPoolingMax;
else if(op.mode == "average")
else if(op.mode == op::pooling_mode::average)
mode = miopenPoolingAverage;
else
MIGRAPHX_THROW("Unknown mode for pooling: " + op.mode);
{
std::stringstream ss("Unknown mode for pooling: ");
ss << op.mode;
MIGRAPHX_THROW(ss.str());
}
auto p = make_obj<pooling_descriptor>(&miopenCreatePoolingDescriptor);
int kdims = op.kdims();
......
@@ -40,9 +40,8 @@ struct hip_prefix_scan_sum : oper<hip_prefix_scan_sum>
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
-        if(op.exclusive or op.reverse)
-            MIGRAPHX_THROW("Exclusive and reverse scan not supported");
-        device::prefix_scan_sum(ctx.get_stream().get(), args[1], args[0], op.axis);
+        device::prefix_scan_sum(
+            ctx.get_stream().get(), args[1], args[0], op.axis, op.exclusive, op.reverse);
return args[1];
}
......
@@ -70,5 +70,11 @@ using index_constant = integral_constant<index_int, N>;
template <auto V>
static constexpr auto _c = integral_constant<decltype(V), V>{}; // NOLINT
+template <class F>
+constexpr auto return_c(F f)
+{
+    return _c<f()>;
+}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
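return_c lifts the result of a captureless callable into an integral_constant: the value becomes part of the return type, so later code can branch on it with if constexpr. A standalone sketch using std::integral_constant as a stand-in for the kernel's own (assumed equivalent for illustration):

#include <type_traits>

template <auto V>
inline constexpr auto _c = std::integral_constant<decltype(V), V>{};

template <class F>
constexpr auto return_c(F f)
{
    // Valid because a captureless lambda's call operator is constexpr
    // and the call reads no state from the object f.
    return _c<f()>;
}

int main()
{
    auto n = return_c([] { return 6 * 7; });
    static_assert(decltype(n)::value == 42, "value is carried in the type");
}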
#ifndef MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP
#define MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
namespace migraphx {
template <class F, class Iterator = diff_int>
struct basic_iota_iterator
{
Iterator index;
F f;
using difference_type = diff_int;
using reference = decltype(f(std::declval<Iterator>()));
using value_type = remove_reference_t<reference>;
using pointer = add_pointer_t<value_type>;
constexpr basic_iota_iterator& operator+=(diff_int n)
{
index += n;
return *this;
}
constexpr basic_iota_iterator& operator-=(diff_int n)
{
index -= n;
return *this;
}
constexpr basic_iota_iterator& operator++()
{
index++;
return *this;
}
constexpr basic_iota_iterator& operator--()
{
index--;
return *this;
}
constexpr basic_iota_iterator operator++(int) // NOLINT
{
basic_iota_iterator it = *this;
index++;
return it;
}
constexpr basic_iota_iterator operator--(int) // NOLINT
{
basic_iota_iterator it = *this;
index--;
return it;
}
// TODO: operator->
constexpr reference operator*() const { return f(index); }
template <class T>
constexpr reference operator[](T x) const
{
return f(index + x);
}
};
template <class T, class F>
constexpr basic_iota_iterator<F, T> make_basic_iota_iterator(T x, F f)
{
return basic_iota_iterator<F, T>{x, f};
}
template <class F, class Iterator>
constexpr basic_iota_iterator<F, Iterator> operator+(basic_iota_iterator<F, Iterator> x, diff_int y)
{
return x += y;
}
template <class F, class Iterator>
constexpr basic_iota_iterator<F, Iterator> operator+(diff_int x, basic_iota_iterator<F, Iterator> y)
{
return y + x;
}
template <class F, class Iterator>
constexpr diff_int operator-(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index - y.index;
}
template <class F, class Iterator>
constexpr basic_iota_iterator<F, Iterator> operator-(basic_iota_iterator<F, Iterator> x, diff_int y)
{
return x -= y;
}
template <class F, class Iterator>
constexpr bool operator==(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index == y.index;
}
template <class F, class Iterator>
constexpr bool operator!=(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index != y.index;
}
template <class F, class Iterator>
constexpr bool operator<(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index < y.index;
}
template <class F, class Iterator>
constexpr bool operator>(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index > y.index;
}
template <class F, class Iterator>
constexpr bool operator>=(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index >= y.index;
}
template <class F, class Iterator>
constexpr bool operator<=(basic_iota_iterator<F, Iterator> x, basic_iota_iterator<F, Iterator> y)
{
return x.index <= y.index;
}
struct default_iota_iterator
{
template <class T>
constexpr auto operator()(T x) const
{
return x;
}
};
using iota_iterator = basic_iota_iterator<default_iota_iterator>;
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP
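basic_iota_iterator is a random-access-style iterator that stores only an integer index and a callable, applying the callable on dereference; tensor_view below uses it so begin()/end() traverse elements in logical order instead of handing out raw pointers. A cut-down host-side sketch of the idea (toy names, not the header above):

#include <cstdio>

template <class F>
struct toy_iota_iterator
{
    int index;
    F f;
    constexpr auto operator*() const { return f(index); } // apply f on dereference
    constexpr toy_iota_iterator& operator++()
    {
        index++;
        return *this;
    }
    constexpr bool operator!=(const toy_iota_iterator& y) const { return index != y.index; }
};

int main()
{
    auto square = [](int i) { return i * i; };
    toy_iota_iterator<decltype(square)> first{0, square};
    toy_iota_iterator<decltype(square)> last{5, square};
    for(; first != last; ++first)
        std::printf("%d ", *first); // prints: 0 1 4 9 16
}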
@@ -39,10 +39,8 @@ template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
preload<typename T::type>(idx, xs...)([&](auto... ps) {
-        idx.global_stride(out.get_shape().elements(), [&](auto i) {
-            auto multi_idx = out.get_shape().multi(i);
-            out[multi_idx] = implicit_conversion(f(ps[multi_idx]...));
-        });
+        idx.global_stride(out.get_shape().elements(),
+                          [&](auto i) { out[i] = implicit_conversion(f(ps[i]...)); });
});
}
......
@@ -29,7 +29,7 @@ constexpr auto traverse_preload(Shapes... ss)
auto each = [&](auto x) {
using type = remove_vec<typename decltype(x)::type>;
constexpr auto s = decltype(x.get_shape()){};
-        constexpr auto size = _c<s.element_space()>;
+        constexpr auto size = s.element_space();
if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or
not is_same<T, type>{})
return f(x, offset, false_type{});
......
@@ -19,7 +19,7 @@ struct max_pool
}
template <class T>
-    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t)
+    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int)
{
return (x);
}
@@ -36,21 +36,19 @@ struct avg_pool
}
template <class T>
-    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t y)
+    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
-template <class T, class Op>
-MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
-                                                 const array<std::size_t, 2>& dims,
-                                                 array<float, 2> xy,
-                                                 Op pooling)
+template <class Iterator, class Op>
+MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
+    const Iterator data, const array<index_int, 2>& dims, array<float, 2> xy, Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
-    for(std::size_t ii = 0; ii < xy.size(); ++ii)
+    for(index_int ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
@@ -65,36 +63,36 @@ MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
-array<std::size_t, 4> locs = {low[0] * dims[1] + low[1],
-                              low[0] * dims[1] + high[1],
-                              high[0] * dims[1] + low[1],
-                              high[0] * dims[1] + high[1]};
+array<index_int, 4> locs = {low[0] * dims[1] + low[1],
+                            low[0] * dims[1] + high[1],
+                            high[0] * dims[1] + low[1],
+                            high[0] * dims[1] + high[1]};
-float ly = xy[0] - low[0];
-float lx = xy[1] - low[1];
-float hy = 1.0f - ly;
-float hx = 1.0f - lx;
-array<T, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
+float ly = xy[0] - low[0];
+float lx = xy[1] - low[1];
+float hy = 1.0f - ly;
+float hx = 1.0f - lx;
+array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
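The four entries of ws are the standard bilinear weights for the corners (low, low), (low, high), (high, low), (high, high), and they always sum to 1. A small host-side numeric check (plain C++, illustrative values):

#include <cstdio>

int main()
{
    // 2x2 neighborhood and a sample point with fractional offsets
    // ly = 0.25 (rows) and lx = 0.75 (cols) inside the cell.
    float data[2][2] = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    float ly = 0.25f, lx = 0.75f;
    float hy = 1.0f - ly, hx = 1.0f - lx;
    // Same weight layout as ws above: {hy*hx, hy*lx, ly*hx, ly*lx}.
    float v = data[0][0] * hy * hx + data[0][1] * hy * lx +
              data[1][0] * ly * hx + data[1][1] * ly * lx;
    std::printf("%f\n", v); // 2.25; the four weights sum to 1
}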
-template <class T, class Op>
-MIGRAPHX_DEVICE_CONSTEXPR T calc_pooling(const T*& data,
-                                         const array<float, 2>& roi_starts,
-                                         const array<float, 2>& bin_size,
-                                         const array<int, 2>& idx,
-                                         const array<std::size_t, 2>& bin_grid_size,
-                                         const array<std::size_t, 2>& dims,
-                                         float roi_offset,
-                                         Op op)
+template <class Iterator, class Op>
+MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
+                                            const array<float, 2>& roi_starts,
+                                            const array<float, 2>& bin_size,
+                                            const array<int, 2>& idx,
+                                            const array<index_int, 2>& bin_grid_size,
+                                            const array<index_int, 2>& dims,
+                                            float roi_offset,
+                                            Op op)
{
-    T output_val = op.init();
-    const int64_t count = bin_grid_size[0] * bin_grid_size[1];
+    typename Iterator::value_type output_val = op.init();
+    const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
-        array<std::size_t, 2> id = {iy, ix};
+        array<index_int, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
@@ -122,19 +120,19 @@ constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
template <class T, class U, class V, class W, class Settings>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t, Settings s)
{
-auto index = make_index();
-const auto* x = x_t.data();
-const auto* rois = rois_t.data();
-const auto* ind = ind_t.data();
+auto index = make_index();
+const auto x = x_t.begin();
+const auto rois = rois_t.begin();
+const auto ind = ind_t.begin();
-auto* out_ptr = y_t.data();
+auto out_ptr = y_t.begin();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
-    array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
+    array<index_int, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
@@ -142,8 +140,8 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
-    const auto& out_lens = out_s.lens;
-    array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
+    const auto& out_lens = out_s.lens;
+    array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
@@ -153,8 +151,8 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
int ph = idx[2];
int pw = idx[3];
-        const auto* offset_rois = rois + (n * roi_column_num);
-        const int batch_ind = ind[n];
+        const auto offset_rois = rois + (n * roi_column_num);
+        const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
offset_rois[0] * s.spatial_scale};
@@ -163,9 +161,9 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
array<float, 2> roi_size{};
array<float, 2> bin_size{};
-        array<std::size_t, 2> bin_grid_size{};
+        array<index_int, 2> bin_grid_size{};
-        for(std::size_t ii = 0; ii < roi_size.size(); ++ii)
+        for(index_int ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = max(roi_size[ii], 1.0f);
@@ -175,7 +173,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
(s.sampling_ratio > 0) ? s.sampling_ratio : std::ceil(roi_size[ii] / out_dims[ii]);
}
-        const auto* offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
+        const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(s.is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
......
@@ -17,35 +17,38 @@ struct shape
constexpr shape(Lens l, Strides s) : lens(l), strides(s) {}
-constexpr index_int elements() const { return lens.product(); }
+constexpr auto elements() const { return _c<Lens{}.product()>; }
-constexpr index_int element_space() const { return strides.dot(lens - 1) + 1; }
+constexpr auto element_space() const { return _c<Strides{}.dot(Lens{} - 1) + 1>; }
-constexpr bool packed() const { return elements() == element_space(); }
-constexpr bool broadcasted() const { return strides.product() == 0; }
-constexpr bool transposed() const
+constexpr auto packed() const { return elements() == element_space(); }
+constexpr auto broadcasted() const { return _c<Strides{}.product() == 0>; }
+constexpr auto transposed() const
{
-if(broadcasted())
-{
-    index_array s;
-    index_int j = 0;
-    for(index_int i = 0; i < s.size(); i++)
-    {
-        if(strides[i] != 0)
-        {
-            s[j] = strides[i];
-            j++;
-        }
-    }
-    return not is_sorted(s.begin(), s.begin() + j, greater{});
-}
-else
-{
-    return not is_sorted(strides.begin(), strides.end(), greater{});
-}
+return return_c([] {
+    auto lstrides = Strides{};
+    if(shape{}.broadcasted())
+    {
+        index_array s{};
+        index_int j = 0;
+        for(index_int i = 0; i < s.size(); i++)
+        {
+            if(lstrides[i] != 0)
+            {
+                s[j] = lstrides[i];
+                j++;
+            }
+        }
+        return not is_sorted(s.begin(), s.begin() + j, greater{});
+    }
+    else
+    {
+        return not is_sorted(lstrides.begin(), lstrides.end(), greater{});
+    }
+});
}
-constexpr bool standard() const { return packed() and not transposed(); }
+constexpr auto standard() const { return packed() and not transposed(); }
constexpr index_int index(index_array x) const { return x.dot(strides); }
@@ -63,10 +66,10 @@ struct shape
return i;
else
{
-            const index_int rank = this->lens.size();
-            index_int s = 1;
-            index_int result = 0;
-            for(index_int j = 0; j < this->lens.size(); j++)
+            const auto rank = this->lens.size();
+            index_int s = 1;
+            index_int result = 0;
+            for(index_int j = 0; j < rank; j++)
{
const index_int k = rank - j - 1;
const index_int stride = this->strides[k];
......
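The shape changes in the hunk above move elements(), element_space(), broadcasted(), and transposed() from runtime values to integral_constant results computed from the Lens and Strides types alone, which is what lets call sites such as traverse_preload branch with if constexpr. A standalone sketch of the mechanism (toy static_shape; std::integral_constant assumed as the stand-in for _c):

#include <type_traits>

template <auto V>
inline constexpr auto _c = std::integral_constant<decltype(V), V>{};

// Toy shape whose lengths live entirely in the type, so every query
// can be answered at compile time.
template <int... Lens>
struct static_shape
{
    constexpr auto elements() const { return _c<(Lens * ... * 1)>; }
};

int main()
{
    constexpr static_shape<2, 3, 4> s{};
    // The element count is carried in the return *type*, so it can size
    // arrays, drive if constexpr, or feed further template arguments.
    static_assert(decltype(s.elements())::value == 24, "");
    int buffer[decltype(s.elements())::value] = {};
    (void)buffer;
}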
@@ -3,17 +3,30 @@
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/iota_iterator.hpp>
namespace migraphx {
+template <class T>
+struct tensor_view_iterator_read
+{
+    T* view;
+    constexpr auto& operator()(std::size_t n) const
+    {
+        MIGRAPHX_ASSERT(view != nullptr);
+        return (*view)[n];
+    }
+};
template <class T, class Shape>
struct tensor_view
{
using type = T;
using shape_type = Shape;
+    using iterator = basic_iota_iterator<tensor_view_iterator_read<const tensor_view>, index_int>;
constexpr Shape get_shape() const { return Shape{}; }
-constexpr index_int size() const { return get_shape().elements(); }
+constexpr auto size() const { return get_shape().elements(); }
template <class U>
constexpr T& operator[](U i) const
@@ -24,8 +37,8 @@ struct tensor_view
constexpr T* data() const { return x; }
-constexpr T* begin() const { return data(); }
-constexpr T* end() const { return data() + size(); }
+constexpr auto begin() const { return iterator{0, {this}}; }
+constexpr auto end() const { return iterator{this->size(), {this}}; }
template <class U>
constexpr tensor_view<U, Shape> with(U* y) const
......
@@ -6,6 +6,12 @@
namespace migraphx {
+template <class T>
+struct type_identity
+{
+    using type = T;
+};
template <bool B, class T = void>
struct enable_if
{
@@ -35,6 +41,33 @@ struct is_same<T, T> : true_type
{
};
+template <class T>
+struct remove_reference
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_reference<T&>
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_reference<T&&>
+{
+    using type = T;
+};
+
+template <class T>
+using remove_reference_t = typename remove_reference<T>::type;
+
+template <class T>
+struct add_pointer : type_identity<typename remove_reference<T>::type*>
+{
+};
+
+template <class T>
+using add_pointer_t = typename add_pointer<T>::type;
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
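The added traits mirror their standard-library counterparts. Quick compile-time checks, written against the std versions so the snippet stands alone:

#include <type_traits>

static_assert(std::is_same<std::remove_reference_t<int&>, int>::value, "");
static_assert(std::is_same<std::remove_reference_t<int&&>, int>::value, "");
// add_pointer strips the reference first, then adds a pointer.
static_assert(std::is_same<std::add_pointer_t<int&>, int*>::value, "");

int main() {}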
......
@@ -6,6 +6,7 @@
namespace migraphx {
using index_int = std::uint32_t;
+using diff_int = std::int32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
......
@@ -819,9 +819,9 @@ struct ref_apply
void apply_pooling(instruction_ref ins) const
{
auto&& op = any_cast<op::pooling>(ins->get_operator());
if(op.mode == "max")
if(op.mode == op::pooling_mode::max)
mod->replace_instruction(ins, ref_pooling<max_pool>{op}, ins->inputs());
else if(op.mode == "average")
else if(op.mode == op::pooling_mode::average)
mod->replace_instruction(ins, ref_pooling<avg_pool>{op}, ins->inputs());
}
};
......
@@ -19,7 +19,12 @@ struct parse_pooling : op_parser<parse_pooling>
tf_parser::node_info info,
std::vector<instruction_ref> args) const
{
-op::pooling op{starts_with(opd.tf_name, "Max") ? "max" : "average"};
+if(!starts_with(opd.tf_name, "Max") && !starts_with(opd.tf_name, "Av"))
+{
+    MIGRAPHX_THROW("tf pooling mode must be Max or Average");
+}
+op::pooling op{starts_with(opd.tf_name, "Max") ? op::pooling_mode::max
+                                               : op::pooling_mode::average};
if(contains(info.attributes, "strides"))
{
......
@@ -13,6 +13,7 @@ endfunction()
add_api_test(assign test_assign.cpp ${TEST_ONNX_DIR})
add_api_test(compile_options test_compile_options.cpp ${TEST_ONNX_DIR})
add_api_test(lookup test_lookup.cpp ${TEST_ONNX_DIR})
+add_api_test(module_construct test_module_construct.cpp ${TEST_ONNX_DIR})
add_api_test(ref test_cpu.cpp ${TEST_ONNX_DIR})
add_api_test(save_load test_save_load.cpp ${TEST_ONNX_DIR})
add_api_test(op test_op_construct.cpp ${TEST_ONNX_DIR})
......
@@ -25,6 +25,23 @@ TEST_CASE(load_and_run)
CHECK(bool{shapes_before.front() == outputs.front().get_shape()});
}
+TEST_CASE(load_and_run_ctx)
+{
+    auto p = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
+    migraphx::compile_options options;
+    options.set_offload_copy();
+    p.compile(migraphx::target("gpu"), options);
+    migraphx::program_parameters pp;
+    auto param_shapes = p.get_parameter_shapes();
+    for(auto&& name : param_shapes.names())
+    {
+        pp.add(name, migraphx::argument::generate(param_shapes[name]));
+    }
+    auto ctx = p.experimental_get_context();
+    p.eval(pp);
+    ctx.finish();
+}
TEST_CASE(if_pl_test)
{
auto run_prog = [&](auto cond) {
......
#include <numeric>
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include "test.hpp"
TEST_CASE(add_op)
{
migraphx::program p;
migraphx::module m = p.get_main_module();
migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
auto x = m.add_parameter("x", param_shape);
auto y = m.add_parameter("y", param_shape);
auto add_op = migraphx::operation("add");
auto r = m.add_instruction(add_op, {x, y});
m.add_return({r});
// run on ref target
p.compile(migraphx::target("ref"));
migraphx::program_parameters pp;
std::vector<float> x_data(9, 1);
std::vector<float> y_data(9, -1);
pp.add("x", migraphx::argument(param_shape, x_data.data()));
pp.add("y", migraphx::argument(param_shape, y_data.data()));
auto outputs = p.eval(pp);
auto output = outputs[0];
std::vector<float> expected(9, 0);
CHECK(bool(output == migraphx::argument(param_shape, expected.data())));
}
TEST_CASE(if_then_else_op)
{
migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
migraphx::shape cond_s{migraphx_shape_bool_type};
auto create_program = [&]() {
migraphx::program p;
auto mm = p.get_main_module();
auto cond = mm.add_parameter("cond", cond_s);
auto x = mm.add_parameter("x", param_shape);
auto y = mm.add_parameter("y", param_shape);
auto then_mod = p.create_module("If_0_if");
auto x_identity = then_mod.add_instruction(migraphx::operation("identity"), {x});
then_mod.add_return({x_identity});
auto else_mod = p.create_module("If_0_else");
auto y_identity = else_mod.add_instruction(migraphx::operation("identity"), {y});
else_mod.add_return({y_identity});
auto if_ins = mm.add_instruction(migraphx::operation("if"), {cond}, {then_mod, else_mod});
auto get_tuple_op = migraphx::operation("get_tuple_elem", "{index: 0}");
auto ret = mm.add_instruction(get_tuple_op, {if_ins});
mm.add_return({ret});
return p;
};
std::vector<float> x_data(9, 1);
std::vector<float> y_data(9, -1);
auto x_arg = migraphx::argument(param_shape, x_data.data());
auto y_arg = migraphx::argument(param_shape, y_data.data());
auto run_prog = [&](bool cond) {
auto p = create_program();
p.compile(migraphx::target("ref"));
auto outputs =
p.eval({{"cond", migraphx::argument(cond_s, &cond)}, {"x", x_arg}, {"y", y_arg}});
return outputs;
};
// then branch
auto then_res = run_prog(true);
CHECK(bool{then_res[0] == x_arg});
// else branch
auto else_res = run_prog(false);
CHECK(bool{else_res[0] == y_arg});
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }