Commit a4f8d30b authored by Paul

Merge branch 'develop' into layout-nhwc

parents d12efcd2 a8d86615
#ifndef MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#define MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <migraphx/kernels/args.hpp>
namespace migraphx {
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
preload<typename T::type>(idx, xs...)([&](auto... ps) {
idx.global_stride(out.get_shape().elements(), [&](auto i) {
auto multi_idx = out.get_shape().multi(i);
out[multi_idx] = f(ps[multi_idx]...);
});
});
}
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
t(ps...)([&](auto... xs) {
auto idx = make_index();
pointwise_tensor(idx, f, xs...);
});
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
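A minimal sketch of how a JIT-generated kernel might drive this entry point. The kernel name, the add lambda, and the assumption that the output pointer is passed last (rotate_last() appears to move it ahead of the inputs so pointwise_tensor() sees it first) are illustrative only; the real wrapper is emitted by the compiler driver and is not part of this diff.

// Hypothetical generated wrapper computing out = a + b. How each void* gets bound
// to a typed tensor_view is decided at JIT-compile time and is not shown here.
#include <migraphx/kernels/pointwise.hpp>
using namespace migraphx;

extern "C" __global__ void add_kernel(void* a, void* b, void* out)
{
    pointwise([](auto x, auto y) __device__ { return x + y; }, a, b, out);
}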
#ifndef MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#define MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx {
template <class Shape>
constexpr bool is_preloadable()
{
Shape s{};
if(not s.broadcasted())
return false;
return true;
}
template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
{
return [=](auto f, auto... g) {
index_int offset = 0;
auto each = [&](auto x) {
constexpr auto s = decltype(x.get_shape()){};
constexpr auto size = _c<s.element_space()>;
if constexpr(not s.broadcasted())
return f(x, offset, false_type{});
else if constexpr((s.elements() - size) < 64)
return f(x, offset, false_type{});
else
{
auto pre_offset = offset;
offset += size;
offset += offset % 4;
return f(x, pre_offset, true_type{});
}
};
return by(each, g...)(ss...);
};
}
template <class T, class... Shapes>
constexpr index_int compute_preload_size(Shapes...)
{
index_int size = 0;
traverse_preload<T>(Shapes{}...)(
[&](auto s, auto offset, auto) { size = offset + s.element_space(); });
return size;
}
template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
auto invoke = [&](auto... ys) {
__syncthreads();
f(ys...);
};
traverse_preload<T>(xs...)(
[&](auto x, auto offset, auto copy) {
if constexpr(copy)
{
auto v = vectorize(x);
auto b = as_vec(tensor_vec_size(v), buffer + offset);
idx.local_stride(v.get_shape().element_space(),
[&](auto i) { b[i] = v.data()[i]; });
return x.with(buffer + offset);
}
else
{
return x;
}
},
invoke);
}
template <class T>
struct remove_vec
{
using type = T;
};
template <class T, index_int N>
struct remove_vec<vec<T, N>>
{
using type = T;
};
template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
using type = typename remove_vec<T>::type;
constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
const index_int max_size = 512 * sizeof(type);
return [=](auto f) {
if constexpr(size > 0 and size < max_size)
{
__shared__ type buffer[size];
preload_copy(idx, f, buffer, xs...);
}
else
{
f(xs...);
}
};
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
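A hedged sketch of calling preload() directly, mirroring how pointwise_tensor() uses it above; the kernel name, the float element type, and the assumption that both tensors share the same rank (as the pointwise kernels do) are for illustration only.

// Hypothetical kernel copying a (possibly broadcasted) input y into x. The tensor
// shapes behind each void* are supplied by the JIT compile step, not shown here.
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/args.hpp>
using namespace migraphx;

extern "C" __global__ void preload_demo(void* x, void* y)
{
    make_tensors()(x, y)([](auto xt, auto yt) __device__ {
        auto idx = make_index();
        // Broadcasted inputs that fit the __shared__ budget come back re-pointed
        // at LDS; anything else is passed through unchanged.
        preload<float>(idx, yt)([&](auto py) {
            idx.global_stride(xt.get_shape().elements(), [&](auto i) {
                auto midx = xt.get_shape().multi(i);
                xt[midx]  = py[midx];
            });
        });
    });
}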
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
template <class F, class G>
struct on_exit
{
F f;
G g;
template <class T>
__host__ __device__ auto operator()(T x) const
{
return f(x);
}
__host__ __device__ ~on_exit() { f(g); }
};
template <class PrivateMIGraphXTypeNameProbe>
constexpr auto print_type_name_probe()
{
constexpr auto name = __PRETTY_FUNCTION__;
constexpr auto size = sizeof(__PRETTY_FUNCTION__);
constexpr auto parameter_name = "PrivateMIGraphXTypeNameProbe = ";
constexpr auto parameter_name_size = sizeof("PrivateMIGraphXTypeNameProbe = ") - 1;
constexpr auto begin =
search(name, name + size, parameter_name, parameter_name + parameter_name_size);
static_assert(begin < name + size, "Type probe not found.");
constexpr auto start = begin + parameter_name_size;
constexpr auto last = find_if(start, name + size, [](auto c) { return c == ']' or c == ';'; });
return [=](const auto& s) { s.print_string(start, last - start); };
}
template <class T>
struct type_printer
{
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& s, type_printer)
{
print_type_name_probe<T>()(s);
return s;
}
};
template <class T>
constexpr type_printer<T> type_of()
{
return {};
}
template <class T>
constexpr type_printer<T> type_of(T)
{
return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of()
{
return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of(T)
{
return {};
}
template <class F>
struct basic_printer
{
F f;
__host__ __device__ const basic_printer& print_long(long value) const
{
f([&] { printf("%li", value); });
return *this;
}
__host__ __device__ const basic_printer& print_ulong(unsigned long value) const
{
f([&] { printf("%lu", value); });
return *this;
}
__host__ __device__ const basic_printer& print_char(char value) const
{
f([&] { printf("%c", value); });
return *this;
}
__host__ __device__ const basic_printer& print_string(const char* value) const
{
f([&] { printf("%s", value); });
return *this;
}
__host__ __device__ const basic_printer& print_string(const char* value, int size) const
{
f([&] { printf("%.*s", size, value); });
return *this;
}
__host__ __device__ const basic_printer& print_double(double value) const
{
f([&] { printf("%f", value); });
return *this;
}
__host__ __device__ const basic_printer& print_bool(bool value) const
{
f([&] {
if(value)
printf("true");
else
printf("false");
});
return *this;
}
__host__ __device__ const basic_printer& operator<<(short value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned short value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(int value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned int value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(long value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned long value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(float value) const
{
return print_double(value);
}
__host__ __device__ const basic_printer& operator<<(double value) const
{
return print_double(value);
}
__host__ __device__ const basic_printer& operator<<(bool value) const
{
return print_bool(value);
}
__host__ __device__ const basic_printer& operator<<(char value) const
{
return print_char(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned char value) const
{
return print_char(value);
}
__host__ __device__ const basic_printer& operator<<(const char* value) const
{
return print_string(value);
}
};
template <class F>
constexpr basic_printer<F> make_printer(F f)
{
return {f};
}
template <class F, class G>
constexpr basic_printer<on_exit<F, G>> make_printer(F f, G g)
{
return {{f, g}};
}
inline __device__ auto cout()
{
return make_printer([](auto f) { f(); });
}
inline __device__ auto coutln()
{
return make_printer([](auto f) { f(); }, [] { printf("\n"); });
}
template <class F, class... Ts>
__device__ void print_each(F f, Ts... xs)
{
each_args([&](auto x) { f() << x; }, xs...);
}
template <class F, class... Ts>
__device__ void print_each_once(F f, Ts... xs)
{
auto idx = make_index();
if(idx.global == 0)
print_each(f, xs...);
}
template <class... Ts>
__device__ void print(Ts... xs)
{
print_each(&cout, xs...);
}
template <class... Ts>
__device__ void print_once(Ts... xs)
{
print_each_once(&cout, xs...);
}
template <class... Ts>
__device__ void println(Ts... xs)
{
print_each(&coutln, xs...);
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
print_each_once(&coutln, xs...);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRINT_HPP
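print_type_name_probe() above works because __PRETTY_FUNCTION__ embeds the deduced template argument in the function's signature string. A standalone, host-side illustration of the same trick, independent of the migraphx headers (the names here are invented):

#include <cstdio>
#include <cstring>

template <class T>
void print_type_name()
{
    // With clang this expands to something like:
    //   "void print_type_name() [T = unsigned long]"
    // gcc uses "[with T = unsigned long]"; both end the type at ']' or ';'.
    const char* sig   = __PRETTY_FUNCTION__;
    const char* start = std::strstr(sig, "T = ");
    if(start == nullptr)
        return;
    start += 4;
    const char* last = start;
    while(*last != ']' and *last != ';' and *last != '\0')
        ++last;
    std::printf("%.*s\n", static_cast<int>(last - start), start);
}

int main()
{
    print_type_name<unsigned long>(); // prints "unsigned long"
    print_type_name<const char*>();   // prints the pointer type's spelling
}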
@@ -19,7 +19,7 @@ struct shape
constexpr index_int elements() const { return lens.product(); }
constexpr index_int element_space() const { return strides.dot(lens - 1); }
constexpr index_int element_space() const { return strides.dot(lens - 1) + 1; }
constexpr bool packed() const { return elements() == element_space(); }
constexpr bool broadcasted() const { return strides.product() == 0; }
@@ -92,6 +92,15 @@ struct shape
result[0] = tidx;
return result;
}
constexpr shape get_shape() const { return *this; }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const shape& s)
{
ss << "{" << s.lens << "}, {" << s.strides << "}";
return ss;
}
};
template <class Lens, class Strides>
......
@@ -2,18 +2,22 @@
#define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
template <class T, class Shape>
struct tensor_view
{
using type = T;
constexpr Shape get_shape() const { return Shape{}; }
constexpr index_int size() const { return get_shape().elements(); }
template <class U>
constexpr T& operator[](U i) const
{
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return x[get_shape().index(i)];
}
@@ -22,6 +26,13 @@ struct tensor_view
constexpr T* begin() const { return data(); }
constexpr T* end() const { return data() + size(); }
template <class U>
constexpr tensor_view<U, Shape> with(U* y) const
{
static_assert(sizeof(T) == sizeof(U), "Not the same size");
return {y};
}
T* x;
};
......
@@ -9,6 +9,9 @@ using index_int = std::uint32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_VEC_HPP
#define MIGRAPHX_GUARD_KERNELS_VEC_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <class T, index_int N>
constexpr auto vec_size(vec<T, N>)
{
return index_constant<N>{};
}
template <class T>
constexpr auto vec_size(T, ...)
{
return index_constant<0>{};
}
template <class T>
constexpr auto vec_size()
{
return decltype(vec_size(T{})){};
}
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
if constexpr(N == 0)
return x;
else
return reinterpret_cast<vec<T, N>*>(x);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
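As a point of reference, a standalone host-side sketch (clang only, since ext_vector_type is a clang extension) of what the vec alias from types.hpp and as_vec<N> amount to; the buffer contents are made up for illustration:

#include <cstdio>
#include <cstdint>

template <class T, std::uint32_t N>
using vec = T __attribute__((ext_vector_type(N))); // clang's vector extension, as in types.hpp

int main()
{
    alignas(16) float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    // Reinterpreting a packed, suitably aligned float* as vec<float, 4>* is what
    // as_vec<4>(float*) does: each load/store then moves four lanes at a time.
    auto* v = reinterpret_cast<vec<float, 4>*>(data);
    vec<float, 4> sum = v[0] + v[1];
    std::printf("%g %g %g %g\n", sum[0], sum[1], sum[2], sum[3]); // prints: 4 6 8 10
}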
#ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>
namespace migraphx {
template <class T>
constexpr auto tensor_vec_size(T)
{
return vec_size<typename T::type>();
}
template <index_int N, class Shape>
constexpr auto as_vec_shape(Shape s)
{
auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
if(stride == 1)
return len / N;
else
return len;
});
auto strides = transform(s.strides, [](auto stride) {
if(stride == 1)
return stride;
return stride / N;
});
MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
return make_shape(lens, strides);
}
template <index_int N, class T>
__device__ __host__ auto as_vec(T x)
{
if constexpr(N == 0)
return x;
else
return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
}
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis)
{
if constexpr(N == 0)
{
return x;
}
else
{
constexpr auto s = decltype(x.get_shape()){};
MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
return sequence(x.get_shape().lens.size(), [&](auto... is) {
auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
constexpr auto axis = Axis{};
if(j == axis)
return i / N;
else
return i;
});
return make_tensor_view(x.data(), make_shape(lens, s.strides));
});
}
}
template <class IntegralConstant, class T>
__device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
{
return as_vec<ic>(x);
}
template <class... Shapes>
constexpr index_int find_vector_axis(Shapes... ss)
{
index_int axis = 0;
bool b = false;
by([&](auto s) {
if(s.broadcasted() or b)
return;
auto it = find(s.strides.begin(), s.strides.end(), 1);
if(it == s.strides.end())
return;
axis = it - s.strides.begin();
b = true;
})(ss...);
return axis;
}
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis axis, Shapes... ss)
{
return (((ss.lens[axis] % N) == 0 and (ss.strides[axis] == 1 or ss.strides[axis] == 0)) and
...);
}
template <index_int N, class... Shapes>
constexpr bool is_vectorizable(Shapes... ss)
{
return (is_vectorizable<N>(find_vector_axis(ss), ss) and ...);
}
template <class P>
constexpr auto find_vectorize_size(P pred)
{
if constexpr(pred(_c<4>))
return _c<4>;
else if constexpr(pred(_c<2>))
return _c<2>;
else
return _c<0>;
}
template <class T>
__host__ __device__ auto vectorize(T x)
{
if constexpr(vec_size<T>() == 0)
{
constexpr auto n =
find_vectorize_size([&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
return as_vec<n>(x);
}
else
{
return x;
}
}
inline __device__ __host__ auto auto_vectorize()
{
return [](auto... xs) {
return [=](auto f) {
// TODO: Just check that there is a single axis with stride 1
constexpr bool packed_or_broadcasted =
((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
if constexpr(packed_or_broadcasted)
{
constexpr auto axis = find_vector_axis(xs.get_shape()...);
constexpr auto n = find_vectorize_size(
[&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
by(
[&](auto x) {
constexpr auto s = x.get_shape();
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
return as_vec<n>(x);
},
f)(xs...);
}
else
{
f(xs...);
}
};
};
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
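A worked example of as_vec_shape with N = 4 (numbers invented for illustration): a packed shape with lens {2, 3, 8} and strides {24, 8, 1} becomes lens {2, 3, 2}, since only the stride-1 axis is divided, and strides {6, 2, 1}, since every non-unit stride is divided. The assertion then holds: the new element_space is 6*1 + 2*2 + 1*1 + 1 = 12, and 12 * 4 matches the original 24*1 + 8*2 + 1*7 + 1 = 48.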
@@ -173,6 +173,7 @@ struct miopen_apply
add_extend_op("rnn_var_sl_last_output");
add_extend_op("rnn_var_sl_shift_output");
add_extend_op("rnn_var_sl_shift_sequence");
add_extend_op("scatter");
add_extend_op("softmax");
add_gemm_op<op::dot>("dot");
......
#include <migraphx/gpu/scatter.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/scatter.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_scatter::compute_shape(std::vector<shape> inputs) const
{
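// The trailing input shape is the pre-allocated output buffer added by the GPU
// lowering; drop it before running the scatter operator's own shape check.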
inputs.pop_back();
return op.normalize_compute_shape(inputs);
}
argument hip_scatter::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
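// args.back() is that same output allocation; the device scatter kernel writes its result there.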
return device::scatter(ctx.get_stream().get(), args.back(), args[0], args[1], args[2], op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -224,23 +224,24 @@ std::vector<value>& get_array_throw(const std::shared_ptr<value_base_impl>& x)
return *a;
}
value* find_impl(const std::shared_ptr<value_base_impl>& x, const std::string& key)
template <class T>
T* find_impl(const std::shared_ptr<value_base_impl>& x, const std::string& key, T* end)
{
auto* a = if_array_impl(x);
if(a == nullptr)
return nullptr;
return end;
auto* lookup = x->if_object();
if(lookup == nullptr)
return nullptr;
return end;
auto it = lookup->find(key);
if(it == lookup->end())
return a->data() + a->size();
return end;
return std::addressof((*a)[it->second]);
}
value* value::find(const std::string& pkey) { return find_impl(x, pkey); }
value* value::find(const std::string& pkey) { return find_impl(x, pkey, this->end()); }
const value* value::find(const std::string& pkey) const { return find_impl(x, pkey); }
const value* value::find(const std::string& pkey) const { return find_impl(x, pkey, this->end()); }
bool value::contains(const std::string& pkey) const
{
const auto* it = find(pkey);
......
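The change above switches find_impl from returning nullptr to returning a caller-supplied end pointer, so value::find can be checked against end() like a standard container lookup. A standalone sketch of the pattern, with types and names invented for illustration:

#include <cstdio>
#include <string>
#include <vector>

struct entry { std::string key; int val; };

// Returning 'end' instead of nullptr lets callers write the familiar
// find(...) != end() check instead of a null test.
const entry* find_entry(const std::vector<entry>& v, const std::string& key, const entry* end)
{
    for(const auto& e : v)
        if(e.key == key)
            return &e;
    return end;
}

int main()
{
    std::vector<entry> v = {{"alpha", 1}, {"beta", 2}};
    const entry* it = find_entry(v, "beta", v.data() + v.size());
    if(it != v.data() + v.size())
        std::printf("%s = %d\n", it->key.c_str(), it->val); // prints: beta = 2
}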
@@ -131,4 +131,32 @@ TEST_CASE(non_standard_return_input)
EXPECT(std::distance(m.begin(), m.end()) == count);
}
TEST_CASE(non_standard_flatten_op)
{
migraphx::module m;
auto l = m.add_parameter("x", {migraphx::shape::float_type, {2, 6, 6, 6}});
auto t = m.add_instruction(
migraphx::make_op("slice", {{"axes", {2, 3}}, {"starts", {1, 1}}, {"ends", {6, 6}}}), l);
auto c = m.add_instruction(migraphx::make_op("contiguous"), t);
m.add_instruction(migraphx::make_op("flatten"), c);
auto count = std::distance(m.begin(), m.end());
run_pass(m);
EXPECT(std::distance(m.begin(), m.end()) == count);
}
TEST_CASE(standard_flatten_op)
{
migraphx::module m;
auto l = m.add_parameter("x", {migraphx::shape::float_type, {2, 6, 6, 6}});
auto t = m.add_instruction(
migraphx::make_op("slice", {{"axes", {0, 1}}, {"starts", {1, 1}}, {"ends", {6, 6}}}), l);
auto c = m.add_instruction(migraphx::make_op("contiguous"), t);
m.add_instruction(migraphx::make_op("flatten"), c);
auto count = std::distance(m.begin(), m.end());
run_pass(m);
EXPECT(std::distance(m.begin(), m.end()) == (count - 1));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -54,7 +54,7 @@ using namespace migraphx;
extern "C" {
__global__ void kernel(void* x, void* y)
{
make_tensors(x, y)([](auto xt, auto yt) __device__ {
make_tensors()(x, y)([](auto xt, auto yt) __device__ {
auto idx = make_index();
const auto stride = idx.nglobal();
for(index_int i = idx.global; i < xt.get_shape().elements(); i += stride)
......
@@ -4,8 +4,7 @@
import numpy as np
import onnx
from onnx import helper
from onnx import numpy_helper
from onnx import AttributeProto, TensorProto, GraphProto
from onnx import TensorProto
def onnx_test(op_test):
@@ -483,7 +482,6 @@ def constant_fill_test():
@onnx_test
def constant_fill_input_as_shape_test():
np_shape = np.array([2, 3])
shape = helper.make_tensor_value_info('shape', TensorProto.INT32, [2])
value = helper.make_tensor_value_info('value', TensorProto.FLOAT, [2, 3])
ts_shape = helper.make_tensor(name='shape_tensor',
@@ -534,7 +532,6 @@ def constant_scalar_test():
def const_of_shape_empty_input_test():
tensor_val = onnx.helper.make_tensor('value', onnx.TensorProto.INT64, [1],
[10])
shape_val = np.array([2, 3, 4]).astype(np.int64)
empty_val = np.array([]).astype(np.int64)
empty_ts = helper.make_tensor(name='empty_tensor',
data_type=TensorProto.INT32,
@@ -1232,98 +1229,6 @@ def equal_bool_test():
return ([node1, node2], [x1, x2], [y])
@onnx_test
def greater_test():
ax1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
x1 = helper.make_tensor("x1",
data_type=TensorProto.FLOAT,
dims=(2, 3),
vals=ax1.astype(np.float32))
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node = onnx.helper.make_node(
'Greater',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x2], [y], [x1])
@onnx_test
def greater_bool_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [2, 3])
x2 = helper.make_tensor_value_info('x2', TensorProto.BOOL, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node1 = onnx.helper.make_node('Cast', inputs=['x1'], outputs=['bx1'], to=9)
node2 = onnx.helper.make_node(
'Greater',
inputs=['bx1', 'x2'],
outputs=['y'],
)
return ([node1, node2], [x1, x2], [y])
@onnx_test
def less_test():
ax1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
x1 = helper.make_tensor("x1",
data_type=TensorProto.FLOAT,
dims=(2, 3),
vals=ax1.astype(np.float32))
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node = onnx.helper.make_node(
'Less',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x2], [y], [x1])
@onnx_test
def less_bool_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [2, 3])
x2 = helper.make_tensor_value_info('x2', TensorProto.BOOL, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node1 = onnx.helper.make_node('Cast', inputs=['x1'], outputs=['bx1'], to=9)
node2 = onnx.helper.make_node(
'Less',
inputs=['bx1', 'x2'],
outputs=['y'],
)
return ([node1, node2], [x1, x2], [y])
@onnx_test
def lessorequal_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [3])
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3])
node = onnx.helper.make_node(
'LessOrEqual',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x1, x2], [y])
@onnx_test
def erf_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])
@@ -1391,6 +1296,29 @@ def flatten_test():
return ([node, node2], [x], [y, y2])
@onnx_test
def flatten_nonstd_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [2, 3, 5, 4])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT, [6, 20])
y2 = helper.make_tensor_value_info('3', TensorProto.FLOAT, [2, 60])
trans = helper.make_node(
'Transpose',
inputs=['0'],
outputs=['tx'],
perm=[0, 1, 3, 2],
)
node = onnx.helper.make_node('Flatten',
inputs=['tx'],
axis=2,
outputs=['2'])
node2 = onnx.helper.make_node('Flatten', inputs=['tx'], outputs=['3'])
return ([trans, node, node2], [x], [y, y2])
@onnx_test
def floor_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10])
@@ -1506,6 +1434,23 @@ def gemm_ex_brcst_test():
return ([node], [m1, m2, m3], [y])
@onnx_test
def gemm_half_test():
m1 = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [1, 1, 8, 6])
m2 = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [1, 1, 8, 7])
m3 = helper.make_tensor_value_info('3', TensorProto.FLOAT16, [1, 1, 6, 1])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [1, 1, 6, 7])
node = onnx.helper.make_node('Gemm',
inputs=['1', '2', '3'],
outputs=['y'],
alpha=0.5,
beta=0.8,
transA=1)
return ([node], [m1, m2, m3], [y])
@onnx_test
def globalavgpool_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 16, 16])
@@ -1534,6 +1479,44 @@ def globalmaxpool_test():
return ([node], [x], [y])
@onnx_test
def greater_test():
ax1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
x1 = helper.make_tensor("x1",
data_type=TensorProto.FLOAT,
dims=(2, 3),
vals=ax1.astype(np.float32))
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node = onnx.helper.make_node(
'Greater',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x2], [y], [x1])
@onnx_test
def greater_bool_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [2, 3])
x2 = helper.make_tensor_value_info('x2', TensorProto.BOOL, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node1 = onnx.helper.make_node('Cast', inputs=['x1'], outputs=['bx1'], to=9)
node2 = onnx.helper.make_node(
'Greater',
inputs=['bx1', 'x2'],
outputs=['y'],
)
return ([node1, node2], [x1, x2], [y])
@onnx_test
def group_conv_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 4, 16, 16])
@@ -1610,8 +1593,6 @@ def if_literal_test():
onnx.TensorProto.FLOAT, [5])
else_out = onnx.helper.make_tensor_value_info('else_out',
onnx.TensorProto.FLOAT, [5])
empty_out = onnx.helper.make_tensor_value_info('empty_out',
onnx.TensorProto.FLOAT, [])
x = np.array([1, 2, 3, 4, 5]).astype(np.float32)
y = np.array([5, 4, 3, 2, 1]).astype(np.float32)
@@ -2231,6 +2212,60 @@ def leaky_relu_test():
return ([node], [x], [y])
@onnx_test
def less_test():
ax1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
x1 = helper.make_tensor("x1",
data_type=TensorProto.FLOAT,
dims=(2, 3),
vals=ax1.astype(np.float32))
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node = onnx.helper.make_node(
'Less',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x2], [y], [x1])
@onnx_test
def less_bool_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [2, 3])
x2 = helper.make_tensor_value_info('x2', TensorProto.BOOL, [2, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 3])
node1 = onnx.helper.make_node('Cast', inputs=['x1'], outputs=['bx1'], to=9)
node2 = onnx.helper.make_node(
'Less',
inputs=['bx1', 'x2'],
outputs=['y'],
)
return ([node1, node2], [x1, x2], [y])
@onnx_test
def lessorequal_test():
x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [3])
x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3])
node = onnx.helper.make_node(
'LessOrEqual',
inputs=['x1', 'x2'],
outputs=['y'],
)
return ([node], [x1, x2], [y])
@onnx_test
def log_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10])
@@ -2294,8 +2329,7 @@ def logsoftmax_test():
@onnx_test
def logsoftmax_nonstd_input_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [6, 9])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [3, 4])
z = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 4])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 4])
node0 = onnx.helper.make_node('Slice',
inputs=['0'],
@@ -2309,7 +2343,7 @@ def logsoftmax_nonstd_input_test():
outputs=['2'],
axis=-1)
return ([node0, node1], [x], [z])
return ([node0, node1], [x], [y])
@onnx_test
@@ -3190,8 +3224,6 @@ def reshape_test():
@onnx_test
def reshape_non_standard_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3, 4])
trans_x = helper.make_tensor_value_info('trans_x', TensorProto.FLOAT,
[2, 4, 3])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [4, 3, 2])
trans = helper.make_node(
@@ -3401,6 +3433,25 @@ def resize_upsample_pc_test():
return ([node], [X], [Y], [scale_tensor])
@onnx_test
def scatter_test():
x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 4, 5, 6])
i = helper.make_tensor_value_info('indices', TensorProto.INT32,
[2, 3, 4, 5])
u = helper.make_tensor_value_info('update', TensorProto.FLOAT,
[2, 3, 4, 5])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 4, 5, 6])
node = onnx.helper.make_node(
'Scatter',
inputs=['data', 'indices', 'update'],
outputs=['y'],
axis=-2,
)
return ([node], [x, i, u], [y])
@onnx_test
def selu_test():
x = helper.make_tensor_value_info('x', TensorProto.DOUBLE, [2, 3])
@@ -3434,7 +3485,6 @@ def shape_gather_test():
values = np.array([1])
# value = helper.make_tensor_value_info('value', TensorProto.INT32, [1])
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [7, 3, 10])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [3])
z = helper.make_tensor_value_info('z', TensorProto.FLOAT, [1])
value_tensor = helper.make_tensor(name='const_tensor',
@@ -3741,8 +3791,7 @@ def softmax_test():
@onnx_test
def softmax_nonstd_input_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [6, 8])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [3, 4])
z = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 4])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 4])
node0 = onnx.helper.make_node('Slice',
inputs=['0'],
@@ -3753,7 +3802,7 @@ def softmax_nonstd_input_test():
node1 = onnx.helper.make_node('Softmax', inputs=['1'], outputs=['2'])
return ([node0, node1], [x], [z])
return ([node0, node1], [x], [y])
@onnx_test
@@ -3856,8 +3905,7 @@ def squeeze_empty_axes_test():
def squeeze_unsqueeze_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
[1, 3, 1, 1, 2, 1])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [3, 2])
z = helper.make_tensor_value_info('2', TensorProto.FLOAT,
y = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[1, 1, 3, 1, 2, 1])
node = onnx.helper.make_node('Squeeze',
......@@ -3870,7 +3918,7 @@ def squeeze_unsqueeze_test():
axes=[0, 1, 3, 5],
outputs=['2'])
return ([node, node2], [x], [z])
return ([node, node2], [x], [y])
@onnx_test
......
@@ -1183,6 +1183,21 @@ TEST_CASE(flatten_test)
EXPECT(p == prog);
}
TEST_CASE(flatten_nonstd_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("0", migraphx::shape{migraphx::shape::float_type, {2, 3, 5, 4}});
auto l1 = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), l0);
auto l2 = mm->add_instruction(migraphx::make_op("contiguous"), l1);
mm->add_instruction(migraphx::make_op("flatten", {{"axis", 2}}), l2);
auto l3 = mm->add_instruction(migraphx::make_op("contiguous"), l1);
mm->add_instruction(migraphx::make_op("flatten", {{"axis", 1}}), l3);
auto prog = optimize_onnx("flatten_nonstd_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(floor_test)
{
migraphx::program p;
@@ -1277,10 +1292,8 @@ TEST_CASE(gemm_test)
auto beta = 2.0f;
auto a_l = mm->add_literal(alpha);
auto t_a = add_common_op(*mm, migraphx::make_op("mul"), {a_l, l0});
t_a = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", l1->get_shape().type()}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {1, 0}}}), t_a);
auto t1 = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {1, 0}}}), l1);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {1, 0}}}), t_a);
auto t1 = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {1, 0}}}), l1);
auto b_l = mm->add_literal(beta);
auto l2_b =
@@ -1305,10 +1318,7 @@ TEST_CASE(gemm_ex_test)
auto beta = 0.8f;
auto a_l = mm->add_literal(alpha);
auto t_a = add_common_op(*mm, migraphx::make_op("mul"), {a_l, l0});
t_a = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", l1->get_shape().type()}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), t_a);
auto b_l = mm->add_literal(beta);
auto b_b = mm->add_instruction(
@@ -1333,10 +1343,7 @@ TEST_CASE(gemm_ex_brcst_test)
auto beta = 0.8f;
auto a_l = mm->add_literal(alpha);
auto t_a = add_common_op(*mm, migraphx::make_op("mul"), {a_l, l0});
t_a = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", l1->get_shape().type()}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), t_a);
auto b_l = mm->add_literal(beta);
auto l2_b =
@@ -1352,6 +1359,37 @@ TEST_CASE(gemm_ex_brcst_test)
EXPECT(p == prog);
}
TEST_CASE(gemm_half_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("1", migraphx::shape{migraphx::shape::half_type, {1, 1, 8, 6}});
auto l1 = mm->add_parameter("2", migraphx::shape{migraphx::shape::half_type, {1, 1, 8, 7}});
auto l2 = mm->add_parameter("3", migraphx::shape{migraphx::shape::half_type, {1, 1, 6, 1}});
auto alpha = 0.5f;
auto beta = 0.8f;
auto a_l = mm->add_literal(alpha);
auto t_a = add_common_op(*mm, migraphx::make_op("mul"), {a_l, l0});
t_a = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", migraphx::shape::half_type}}), t_a);
t_a = mm->add_instruction(migraphx::make_op("transpose", {{"dims", {0, 1, 3, 2}}}), t_a);
std::vector<std::size_t> lens = {1, 1, 6, 7};
l2 = mm->add_instruction(migraphx::make_op("multibroadcast", {{"output_lens", lens}}), l2);
l2 = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", migraphx::shape::float_type}}), l2);
auto b_l = mm->add_literal(beta);
auto b_b =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"output_lens", lens}}), b_l);
auto l2_b = mm->add_instruction(migraphx::make_op("mul"), l2, b_b);
l2_b = mm->add_instruction(
migraphx::make_op("convert", {{"target_type", migraphx::shape::half_type}}), l2_b);
mm->add_instruction(migraphx::make_op("dot", {{"alpha", 1.0f}, {"beta", 1.0f}}), t_a, l1, l2_b);
auto prog = optimize_onnx("gemm_half_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(globalavgpool_test)
{
migraphx::program p;
@@ -3251,6 +3289,23 @@ TEST_CASE(round_test)
EXPECT(p == prog);
}
TEST_CASE(scatter_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {3, 4, 5, 6}});
auto l1 =
mm->add_parameter("indices", migraphx::shape{migraphx::shape::int32_type, {2, 3, 4, 5}});
auto l2 =
mm->add_parameter("update", migraphx::shape{migraphx::shape::float_type, {2, 3, 4, 5}});
int axis = -2;
auto r = mm->add_instruction(migraphx::make_op("scatter", {{"axis", axis}}), l0, l1, l2);
mm->add_return({r});
auto prog = migraphx::parse_onnx("scatter_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(selu_test)
{
migraphx::program p;
......