fix tests and tf parser

51597ed7 · Khalique · 7bacd3ba · bc80dee8 · 51597ed7 · 51597ed7
Commit 51597ed7 authored Jul 08, 2019 by Khalique
20 changed files
--- a/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
@@ -73,7 +73,7 @@ __host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i)

 inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
 {
-    std::size_t groups  = 1 + n / local;
+    std::size_t groups  = (n + local - 1) / local;
    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;

    return [=](auto f) {

--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -128,7 +128,7 @@ __device__ T dpp_mov(T& x)
 template <class T, class Op>
 __device__ void dpp_reduce(T& in, Op op)
 {
-    T out;
+    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);

--- a/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
@@ -119,13 +119,13 @@ tensor_view<device_type<T>> device_cast(tensor_view<T> x)
 }

 template <class T>
-T to_hip_type(T x)
+__device__ __host__ T to_hip_type(T x)
 {
    return x;
 }

 // Hip doens't support __fp16
-inline float to_hip_type(gpu_half x) { return x; }
+inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -11,53 +12,45 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream, argument result, argument arg, int axis)
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
-
-    auto lens         = result.get_shape().lens();
-    auto num_in_batch = lens[axis];
-    auto batch_lens   = lens;
-    batch_lens[axis]  = 1;
-    shape batch_shape{result.get_shape().type(), batch_lens};
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};

    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_shape.elements())([=](auto i) {
-            auto batch_idx = batch.multi(i);
-            auto data_idx  = batch_idx;
-
-            // get max
-            auto batch_max = input[batch_idx];
-            for(std::size_t j = 1; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                batch_max      = std::max(to_hip_type(batch_max), to_hip_type(input[data_idx]));
-            }
-
-            for(std::size_t j = 0; j < num_in_batch; ++j)
-            {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size);
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest();
+
+            auto batch_max = block_reduce<max_block_size>(
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return input[data_idx];
+                });
+
+            auto batch_sum =
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
+
+            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
+
+            idx.local_stride(batch_item_num, [&](auto j) {
                data_idx[axis]   = j;
-                output[data_idx] = input[data_idx] - batch_max;
-            }
-
-            auto batch_sum = ::exp(to_hip_type(output[batch_idx]));
-            for(std::size_t j = 1; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                batch_sum += ::exp(to_hip_type(output[data_idx]));
-            }
-            batch_sum = ::log(to_hip_type(batch_sum));
-
-            for(std::size_t j = 0; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                output[data_idx] -= batch_sum;
-            }
+                output[data_idx] = input[data_idx] - log_batch_sum;
+            });
        });
    });
-
-    return result;
 }

 } // namespace device

--- a/src/targets/gpu/device/softmax.cpp
+++ b/src/targets/gpu/device/softmax.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -12,51 +13,44 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream, argument result, argument arg, int axis)
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
-    auto lens        = result.get_shape().lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    shape batch_shape{result.get_shape().type(), batch_lens};
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};

    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_shape.elements())([=](auto i) {
-            auto batch_idx = batch.multi(i);
-            auto data_idx  = batch_idx;
-
-            // get max
-            auto batch_max = input[batch_idx];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                data_idx[axis] = j;
-                batch_max      = std::max(to_hip_type(batch_max), to_hip_type(input[data_idx]));
-            }
-
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size);
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest();
+
+            auto batch_max = block_reduce<max_block_size>(
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return input[data_idx];
+                });
+
+            auto batch_sum =
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
+
+            idx.local_stride(batch_item_num, [&](auto j) {
                data_idx[axis]   = j;
-                output[data_idx] = exp(to_hip_type(input[data_idx] - batch_max));
-            }
-
-            auto batch_sum = output[batch_idx];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                data_idx[axis] = j;
-                batch_sum += output[data_idx];
-            }
-
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                data_idx[axis]   = j;
-                output[data_idx] = output[data_idx] / batch_sum;
-            }
+                auto val         = input[data_idx] - batch_max;
+                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+            });
        });
    });
-
-    return result;
 }

 } // namespace device

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -200,12 +200,33 @@ struct hip_add_relu
    }
 };

+void move_broadcasted_back(std::vector<instruction_ref>& args)
+{
+    // Ensure the last arguments is the broadcasted one
+    auto it = std::find_if(
+        args.begin(), args.end(), [](auto arg) { return arg->get_shape().broadcasted(); });
+    if(it != args.end())
+        std::swap(*it, *std::prev(args.end(), 2));
+}
+
+void move_standard_front(std::vector<instruction_ref>& args)
+{
+    // Ensure the first arguments is the standard one
+    auto it = std::find_if(
+        args.begin(), args.end(), [](auto arg) { return arg->get_shape().standard(); });
+    if(it != args.end())
+        std::swap(*it, args.front());
+}
+
 struct find_add_relu
 {
    auto matcher() const
    {
-        return match::name("gpu::relu")(match::arg(0)(
-            match::any_of(match::name("gpu::add"), match::name("hip::triadd")).bind("add")));
+        return match::name("gpu::relu")(
+            match::arg(0)(match::any_of(match::name("gpu::add"),
+                                        match::name("hip::triadd"),
+                                        match::any_of[match::inputs()](match::standard_shape()))
+                              .bind("add")));
    }

    void apply(program& p, match::matcher_result r) const
@@ -213,6 +234,9 @@ struct find_add_relu
        auto add_ins = r.instructions["add"];
        auto ins     = r.result;
        auto args    = add_ins->inputs();
+        move_standard_front(args);
+        move_broadcasted_back(args);
+
        // Use the allocation from the relu operator
        args.back() = ins->inputs().back();
        if(add_ins->name() == "gpu::add")
@@ -226,24 +250,26 @@ struct find_triadd
 {
    auto matcher() const
    {
-        return match::name("gpu::add")(match::either_arg(0, 1)(match::name("gpu::add").bind("add"),
-                                                               match::any().bind("input")));
+        return match::name("gpu::add")(match::either_arg(0, 1)(
+            match::name("gpu::add").bind("add"),
+            match::any(match::any_of[match::inputs()](match::standard_shape())).bind("input")));
    }

    void apply(program& p, match::matcher_result r) const
    {
-        auto add_ins        = r.instructions["add"];
-        auto input_ins      = r.instructions["input"];
-        auto ins            = r.result;
-        auto args           = add_ins->inputs();
+        auto add_ins   = r.instructions["add"];
+        auto input_ins = r.instructions["input"];
+        auto ins       = r.result;
+        auto args      = add_ins->inputs();
+        assert(add_ins != input_ins);
+
        auto is_broadcasted = [](auto arg) { return arg->get_shape().broadcasted(); };
        if(std::count_if(args.begin(), args.end(), is_broadcasted) > 1)
            return;
        args.insert(args.begin(), input_ins);
-        // Ensure the last arguments is the broadcasted one
-        auto it = std::find_if(args.begin(), args.end(), is_broadcasted);
-        if(it != args.end())
-            std::swap(*it, *std::prev(args.end(), 2));
+        move_standard_front(args);
+        move_broadcasted_back(args);
+
        args.back() = ins->inputs().back();
        p.replace_instruction(ins, hip_triadd{}, args);
    }
@@ -402,8 +428,8 @@ void fuse_ops::apply(program& p) const
    // clang-format off
    match::find_matches(p, find_triadd{});
    match::find_matches(p, 
-        // find_conv_bias_relu{ctx},
-        // find_conv_bias{ctx},
+        find_conv_bias_relu{ctx},
+        find_conv_bias{ctx},
        find_add_relu{}
    );
    // clang-format on

--- a/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/argmax.hpp>
+#include <migraphx/gpu/device/argmax.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+struct hip_argmax
+{
+    op::argmax op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::argmax"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmin.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/argmin.hpp>
+#include <migraphx/gpu/device/argmin.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+struct hip_argmin
+{
+    op::argmin op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::argmin"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
+#include <migraphx/gpu/hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+template <class T>
+struct val_index
+{
+    T val;
+    int64_t index;
+};
+
+template <class T>
+MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v)
+{
+    return {v, -1};
+}
+
+template <class T>
+MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v, int64_t i)
+{
+    return {v, i};
+}
+
+struct argmax_op
+{
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        if(x.val > y.val)
+            return x;
+        else if(x.val < y.val)
+            return y;
+        else
+        {
+            return (x.index < y.index) ? x : y;
+        }
+    }
+
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); }
+};
+
+struct argmin_op
+{
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        if(x.val < y.val)
+            return x;
+        else if(x.val > y.val)
+            return y;
+        else
+        {
+            return (x.index < y.index) ? x : y;
+        }
+    }
+
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); }
+};
+
+template <class Op>
+void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
+{
+    auto arg_shape        = arg.get_shape();
+    auto lens             = arg_shape.lens();
+    auto batch_lens       = lens;
+    size_t batch_item_num = lens[axis];
+    batch_lens[axis]      = 1;
+    migraphx::shape batch_shape{arg_shape.type(), batch_lens};
+
+    hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
+        auto output = device_cast(result.get<int64_t>().data());
+        using type  = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+        // use one block for items in one batch.
+        const size_t max_block_size  = 256;
+        const std::size_t block_size = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto batch_idx = batch_s.multi(i / block_size);
+            auto data_idx  = batch_idx;
+            auto init      = make_val_index<type>(op.init());
+
+            auto op_output =
+                block_reduce<max_block_size>(idx, op, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return make_val_index(input[arg_s.index(data_idx)], j);
+                });
+
+            if(idx.local == 0)
+            {
+                output[batch_s.index(batch_idx)] = op_output.index;
+            }
+        });
+    });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream, argument result, argument arg, int axis);
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream, argument result, argument arg, int axis);
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/logsoftmax.cpp
+++ b/src/targets/gpu/logsoftmax.cpp
@@ -18,7 +18,8 @@ shape hip_logsoftmax::compute_shape(const std::vector<shape>& inputs) const
 argument
 hip_logsoftmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::logsoftmax(ctx.get_stream().get(), args[1], args[0], op.axis);
+    device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
 }

 } // namespace gpu

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -11,6 +11,8 @@
 #include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/gpu/argmax.hpp>
+#include <migraphx/gpu/argmin.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
@@ -102,6 +104,8 @@ struct miopen_apply
        add_extend_op<hip_concat, op::concat>("concat");
        add_extend_op<hip_softmax, op::softmax>("softmax");
        add_extend_op<hip_logsoftmax, op::logsoftmax>("logsoftmax");
+        add_extend_op<hip_argmax, op::argmax>("argmax");
+        add_extend_op<hip_argmin, op::argmin>("argmin");
        add_extend_op<hip_gather, op::gather>("gather");
        add_extend_op<hip_pad, op::pad>("pad");
        add_extend_op<hip_convert, op::convert>("convert");

--- a/src/targets/gpu/softmax.cpp
+++ b/src/targets/gpu/softmax.cpp
@@ -39,7 +39,8 @@ shape hip_softmax::compute_shape(const std::vector<shape>& inputs) const

 argument hip_softmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::softmax(ctx.get_stream().get(), args[1], args[0], op.axis);
+    device::softmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
 }

 } // namespace gpu

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -36,6 +36,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
    // clang-format off
    return
    {
+        dead_code_elimination{},
+        simplify_reshapes{},
        dead_code_elimination{},
        eliminate_identity{},
        eliminate_pad{},
@@ -48,11 +50,11 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
        //dead_code_elimination{},
        simplify_algebra{},
        dead_code_elimination{},
-        propagate_constant{},
-        dead_code_elimination{},
        auto_contiguous{},
        simplify_reshapes{},
        dead_code_elimination{},
+        propagate_constant{},
+        dead_code_elimination{},
        lowering{ctx},
        eliminate_concat{concat_gpu_optimization{}},
        dead_code_elimination{},

--- a/src/tf/tf.cpp
+++ b/src/tf/tf.cpp
@@ -37,8 +37,49 @@ struct tf_parser

    std::unordered_map<std::string, op_func> ops;

-    std::vector<size_t>
-    parse_axes(const attribute_map& attributes, const std::string& s, const size_t& num_dims) const
+    bool should_transpose(instruction_ref ins) const
+    {
+        return is_nhwc and ins->get_shape().lens().size() == 4;
+    }
+
+    instruction_ref to_nhwc(instruction_ref ins)
+    {
+        if(should_transpose(ins))
+            return prog.add_instruction(op::transpose{{0, 2, 3, 1}}, ins);
+        return ins;
+    }
+
+    instruction_ref to_nchw(instruction_ref ins)
+    {
+        if(should_transpose(ins))
+            return prog.add_instruction(op::transpose{{0, 3, 1, 2}}, ins);
+        return ins;
+    }
+
+    instruction_ref to_kcxy(instruction_ref ins)
+    {
+        if(should_transpose(ins))
+            return prog.add_instruction(op::transpose{{3, 2, 0, 1}}, ins);
+        return ins;
+    }
+
+    instruction_ref make_contiguous(instruction_ref ins)
+    {
+        if(ins->get_shape().standard())
+            return ins;
+        else
+            return prog.add_instruction(op::contiguous{}, ins);
+    }
+
+    std::vector<instruction_ref> to_nchw(const std::vector<instruction_ref>& args)
+    {
+        std::vector<instruction_ref> result(args.size());
+        std::transform(
+            args.begin(), args.end(), result.begin(), [&](auto ins) { return this->to_nchw(ins); });
+        return result;
+    }
+
+    std::vector<size_t> parse_axes(const attribute_map& attributes, const std::string& s, const size_t& num_dims) const
    {
        auto attrs = attributes.at(s).list().i();
        std::vector<size_t> axes;
@@ -120,60 +161,68 @@ struct tf_parser

        add_mem_op("AvgPool", &tf_parser::parse_pooling);
        add_mem_op("BiasAdd", &tf_parser::parse_biasadd);
-        add_mem_op("ConcatV2", &tf_parser::parse_concat);
+        add_mem_op("ConcatV2", &tf_parser::parse_concat, false);
        add_mem_op("Const", &tf_parser::parse_constant);
        add_mem_op("Conv2D", &tf_parser::parse_conv);
        add_mem_op("DepthwiseConv2dNative", &tf_parser::parse_depthwiseconv);
        add_mem_op("ExpandDims", &tf_parser::parse_expanddims);
        add_mem_op("FusedBatchNorm", &tf_parser::parse_batchnorm);
-        add_mem_op("MatMul", &tf_parser::parse_matmul);
+        add_mem_op("MatMul", &tf_parser::parse_matmul, false);
        add_mem_op("MaxPool", &tf_parser::parse_pooling);
        add_mem_op("Mean", &tf_parser::parse_mean);
-        add_mem_op("Pack", &tf_parser::parse_pack);
+        add_mem_op("Pack", &tf_parser::parse_pack, false);
        add_mem_op("Pad", &tf_parser::parse_pad);
-        add_mem_op("Reshape", &tf_parser::parse_reshape);
+        add_mem_op("Reshape", &tf_parser::parse_reshape, false);
        add_mem_op("Softmax", &tf_parser::parse_softmax);
-        add_mem_op("Squeeze", &tf_parser::parse_squeeze);
+        add_mem_op("Squeeze", &tf_parser::parse_squeeze, false);
        add_mem_op("StridedSlice", &tf_parser::parse_stridedslice);
    }

    template <class F>
-    void add_op(std::string name, F f)
-    {
-        ops.emplace(name, f);
-    }
-
-    // Multi output op
-    template <class F>
-    void add_multi_op(std::string name, F f)
+    void add_op(std::string name, F f, bool transpose = true)
    {
-        ops.emplace(name, f);
+        if(transpose)
+        {
+            ops.emplace(name,
+                        op_func{[=](const attribute_map& attributes,
+                                    const std::vector<instruction_ref>& args) -> instruction_ref {
+                            return to_nhwc(f(attributes, to_nchw(args)));
+                        }});
+        }
+        else
+        {
+            ops.emplace(name, f);
+        }
    }

    template <class F>
-    void add_mem_op(std::string name, F f)
+    void add_mem_op(std::string name, F f, bool transpose = true)
    {
-        add_op(name, [=](auto&&... xs) {
-            return std::mem_fn(f)(*this, name, std::forward<decltype(xs)>(xs)...);
-        });
+        add_op(name,
+               [=](auto&&... xs) {
+                   return std::mem_fn(f)(*this, name, std::forward<decltype(xs)>(xs)...);
+               },
+               transpose);
    }

    template <class T>
    void add_binary_op(std::string name, T x)
    {
-        add_op(name, [this, x](const attribute_map& attributes, std::vector<instruction_ref> args) {
-            if(args.size() != 2)
-                MIGRAPHX_THROW("binary operators should have 2 operands");
-            auto l0 = args[1];
-            if(contains(attributes, "data_format"))
-            {
-                if(is_nhwc)
-                {
-                    l0 = prog.add_instruction(op::transpose{{0, 3, 1, 2}}, args[1]);
-                }
-            }
-            return add_broadcastable_binary_op(args[0], l0, x);
-        });
+        add_op(name,
+               [this, x](const attribute_map&, std::vector<instruction_ref> args) {
+                   if(args.size() != 2)
+                       MIGRAPHX_THROW("binary operators should have 2 operands");
+                   // TODO
+                   // if(contains(attributes, "data_format"))
+                   // {
+                   //     if(is_nhwc)
+                   //     {
+                   //         l0 = prog.add_instruction(op::transpose{{0, 3, 1, 2}}, args[1]);
+                   //     }
+                   // }
+                   return add_broadcastable_binary_op(args[0], args[1], x);
+               },
+               false);
    }

    template <class T>
@@ -212,20 +261,22 @@ struct tf_parser

            auto l0 = prog.add_instruction(op::multibroadcast{output_lens}, arg0);
            auto l1 = prog.add_instruction(op::multibroadcast{output_lens}, arg1);
-            return prog.add_instruction(x, l0, l1);
+            return to_nhwc(prog.add_instruction(x, to_nchw(l0), to_nchw(l1)));
        }
        else
        {
-            return prog.add_instruction(x, {arg0, arg1});
+            return to_nhwc(prog.add_instruction(x, {to_nchw(arg0), to_nchw(arg1)}));
        }
    }

    template <class T>
-    void add_generic_op(std::string name, T x)
+    void add_generic_op(std::string name, T x, bool transpose = true)
    {
-        add_op(name, [this, x](const attribute_map&, std::vector<instruction_ref> args) {
-            return prog.add_instruction(x, args);
-        });
+        add_op(name,
+               [this, x](const attribute_map&, std::vector<instruction_ref> args) {
+                   return prog.add_instruction(x, args);
+               },
+               transpose);
    }

    instruction_ref
@@ -255,8 +306,7 @@ struct tf_parser
    {
        // get index for axis within args
        size_t axis_idx = attributes.at("N").i();
-        size_t axis =
-            parse_axis(args[axis_idx]->eval().at<int64_t>(), args[0]->get_shape().lens().size());
+        size_t axis     = args[axis_idx]->eval().at<int64_t>();
        op::concat op{axis};
        // return only first N arguments (assuming last index is the axis value)
        return prog.add_instruction(
@@ -267,16 +317,8 @@ struct tf_parser
                                   attribute_map attributes,
                                   const std::vector<instruction_ref>&)
    {
-        literal v       = parse_tensor(attributes.at("value").tensor());
-        auto l0         = prog.add_literal(v);
-        size_t num_axes = l0->get_shape().lens().size();
-        if(num_axes >= 4)
-        {
-            std::vector<int64_t> transpose_axes = get_axes(num_axes);
-            reorder_data(transpose_axes);
-            l0 = prog.add_instruction(op::transpose{transpose_axes}, l0);
-        }
-        return l0;
+        literal v = parse_tensor(attributes.at("value").tensor());
+        return prog.add_literal(v);
    }

    instruction_ref
@@ -307,22 +349,9 @@ struct tf_parser
            op.dilation[0] = dilation[2];
            op.dilation[1] = dilation[3];
        }
-        auto weights = args[1];
-        // check if weights are from a constant
-
-        if(weights->name() != "@param")
-        {
-            if(is_nhwc)
-            {
-                weights = prog.add_instruction(op::transpose{{1, 3, 0, 2}}, args[1]);
-            }
-            else
-            {
-                weights = prog.add_instruction(op::transpose{{3, 2, 0, 1}}, args[1]);
-            }
-        }

-        auto l0 = args[0];
+        auto weights = to_kcxy(args[1]);
+        auto l0      = args[0];
        if(contains(attributes, "padding"))
        {
            const std::string& pad_mode = attributes.at("padding").s();
@@ -371,8 +400,7 @@ struct tf_parser
                op.padding[1] = padding[1];
            }
        }
-
-        return prog.add_instruction(op, {l0, weights});
+        return prog.add_instruction(op, {l0, to_kcxy(args[1])});
    }

    instruction_ref parse_depthwiseconv(const std::string&,
@@ -395,6 +423,8 @@ struct tf_parser
            op.stride[0] = stride[2];
            op.stride[1] = stride[3];
        }
+
+        auto weights = to_kcxy(args[1]);
        if(contains(attributes, "dilations"))
        {
            std::vector<size_t> dilation;
@@ -408,20 +438,6 @@ struct tf_parser
            op.dilation[1] = dilation[3];
        }

-        auto weights = args[1];
-        // check if weights are from a constant
-        if(weights->name() != "@param")
-        {
-            if(is_nhwc)
-            {
-                weights = prog.add_instruction(op::transpose{{1, 3, 0, 2}}, args[1]);
-            }
-            else
-            {
-                weights = prog.add_instruction(op::transpose{{3, 2, 0, 1}}, args[1]);
-            }
-        }
-
        auto l0 = args[0];
        if(contains(attributes, "padding"))
        {
@@ -469,8 +485,8 @@ struct tf_parser
        new_weights_shape[0] = out_channels;
        new_weights_shape[1] = 1;
        // Make sure weights are contiguous before doing reshape
-        auto cweights    = prog.add_instruction(op::contiguous{}, weights);
-        auto new_weights = prog.add_instruction(op::reshape{new_weights_shape}, cweights);
+        auto new_weights =
+            prog.add_instruction(op::reshape{new_weights_shape}, make_contiguous(weights));

        return prog.add_instruction(op, {l0, new_weights});
    }
@@ -558,15 +574,14 @@ struct tf_parser
            MIGRAPHX_THROW("TF_PARSER: axis value of " + to_string(axis) +
                           " must be smaller than input size " + to_string(input_size));
        }
-        // check if input arg needs axis to be converted to NCHW
-        axis = parse_axis(axis, input_size);

        std::transform(
            args.begin(),
            args.end(),
            std::back_inserter(unsqueezed_args),
            [&](instruction_ref arg) { return prog.add_instruction(op::unsqueeze{{axis}}, arg); });
-        return prog.add_instruction(op::concat{static_cast<size_t>(axis)}, unsqueezed_args);
+        return to_nhwc(
+            prog.add_instruction(op::concat{static_cast<size_t>(axis)}, unsqueezed_args));
    }

    instruction_ref
@@ -669,7 +684,7 @@ struct tf_parser
            MIGRAPHX_THROW("reshape needs 2 arguments (input, new_shape)");
        auto s = args[1]->eval();
        s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
-        return prog.add_instruction(op, args[0]);
+        return prog.add_instruction(op, make_contiguous(args[0]));
    }

    void parse_from(std::istream& is)
@@ -701,7 +716,7 @@ struct tf_parser
    {
        op::squeeze op;
        auto input_dims = args[0]->get_shape().lens();
-        auto axes       = parse_axes(attributes, "squeeze_dims", input_dims.size());
+        auto axes = attributes.at("squeeze_dims").list().i();
        copy(axes, std::back_inserter(op.axes));

        if(op.axes.empty()) // no squeeze_dims provided, remove any dim that equals 1
@@ -714,7 +729,7 @@ struct tf_parser
                }
            }
        }
-        return prog.add_instruction(op, args[0]);
+        return prog.add_instruction(op, make_contiguous(args[0]));
    }

    instruction_ref parse_stridedslice(const std::string&,
@@ -725,11 +740,6 @@ struct tf_parser
        auto starts     = args[1]->eval().get<int32_t>().to_vector();
        auto ends       = args[2]->eval().get<int32_t>().to_vector();
        size_t num_axes = args[0]->get_shape().lens().size();
-        if(num_axes >= 4)
-        {
-            reorder_data(starts);
-            reorder_data(ends);
-        }

        op.starts = std::vector<int64_t>(starts.begin(), starts.end());
        op.ends   = std::vector<int64_t>(ends.begin(), ends.end());
@@ -748,10 +758,9 @@ struct tf_parser
            if(((shrink_axis_mask >> i) & bitwise_compare) == 1)
                squeeze_axes.push_back(i);
        }
-        squeeze_axes = parse_axes(squeeze_axes, num_axes);

-        auto l0 = prog.add_instruction(op, args[0]);
-        return prog.add_instruction(op::squeeze{squeeze_axes}, l0);
+        auto l0 = prog.add_instruction(op, make_contiguous(args[0]));
+        return to_nhwc(prog.add_instruction(op::squeeze{squeeze_axes}, l0));
    }

    void parse_graph(const tensorflow::GraphDef& graph)
@@ -768,7 +777,7 @@ struct tf_parser
                reorder_data(dims);
            }
            shape s            = shape{shape_type, dims};
-            instructions[name] = prog.add_parameter(name, s);
+            instructions[name] = to_nhwc(prog.add_parameter(name, s));
        }
        for(auto&& p : nodes)
        {
@@ -1118,6 +1127,7 @@ program parse_tf(const std::string& name, bool is_nhwc)
 #else
    parser.parse_from(input);
 #endif
+    parser.to_nchw(std::prev(parser.prog.end()));
    return std::move(parser.prog);
 }


--- a/test/cpu_ops_test.cpp
+++ b/test/cpu_ops_test.cpp
@@ -941,9 +941,6 @@ TEST_CASE(softmax_simple_test)
    auto result = p.eval({});
    std::vector<float> results_vector(2);
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
-    for(auto v : results_vector)
-        std::cout << v << "\t";
-    std::cout << std::endl;
    EXPECT(migraphx::verify_range(results_vector, s));
 }

@@ -1138,6 +1135,114 @@ TEST_CASE(logsoftmax_test_axis_3)
    EXPECT(migraphx::verify_range(results_vector, s));
 }

+TEST_CASE(argmax_test_0)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmax{0}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
+TEST_CASE(argmax_test_1)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {0, 0, 2, 1, 2, 0, 0, 2};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmax{1}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
+TEST_CASE(argmax_test_2)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {1, 3, 2, 2, 2, 3};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmax{2}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
+TEST_CASE(argmin_test_0)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmin{0}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
+TEST_CASE(argmin_test_1)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {2, 2, 0, 2, 0, 1, 2, 0};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmin{1}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
+TEST_CASE(argmin_test_2)
+{
+    migraphx::program p;
+    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> res_gold = {2, 1, 0, 3, 3, 2};
+    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
+    auto dl = p.add_literal(migraphx::literal{data_shape, data});
+    p.add_instruction(migraphx::op::argmin{2}, dl);
+    p.compile(migraphx::cpu::target{});
+    auto result = p.eval({});
+    std::vector<int64_t> result_vec;
+    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });
+
+    EXPECT(migraphx::verify_range(result_vec, res_gold));
+}
+
 TEST_CASE(conv2d_test)
 {
    migraphx::program p;

--- a/test/gpu/miopen.cpp
+++ b/test/gpu/miopen.cpp
@@ -592,13 +592,13 @@ struct test_softmax2 : verify_program<test_softmax2>
    }
 };

-template <int Axis>
-struct test_softmax : verify_program<test_softmax<Axis>>
+template <int Axis, migraphx::shape::type_t T>
+struct test_softmax : verify_program<test_softmax<Axis, T>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3, 4, 5, 6}};
+        migraphx::shape s{T, {512, 4, 1067, 6}};
        auto param = p.add_parameter("0", s);
        p.add_instruction(migraphx::op::softmax{Axis}, param);

@@ -606,10 +606,38 @@ struct test_softmax : verify_program<test_softmax<Axis>>
    }
 };

-template struct test_softmax<0>;
-template struct test_softmax<1>;
-template struct test_softmax<2>;
-template struct test_softmax<3>;
+template struct test_softmax<0, migraphx::shape::float_type>;
+template struct test_softmax<2, migraphx::shape::float_type>;
+template struct test_softmax<1, migraphx::shape::double_type>;
+template struct test_softmax<3, migraphx::shape::double_type>;
+template struct test_softmax<0, migraphx::shape::half_type>;
+template struct test_softmax<1, migraphx::shape::half_type>;
+template struct test_softmax<2, migraphx::shape::half_type>;
+template struct test_softmax<3, migraphx::shape::half_type>;
+
+template <class T, int Axis>
+struct test_arg_ops : verify_program<test_arg_ops<T, Axis>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        migraphx::shape s{migraphx::shape::float_type, {2, 3, 4, 1025}};
+        auto param = p.add_parameter("data", s);
+        p.add_instruction(T{Axis}, param);
+
+        return p;
+    }
+};
+
+template struct test_arg_ops<migraphx::op::argmax, 0>;
+template struct test_arg_ops<migraphx::op::argmax, 1>;
+template struct test_arg_ops<migraphx::op::argmax, 2>;
+template struct test_arg_ops<migraphx::op::argmax, 3>;
+
+template struct test_arg_ops<migraphx::op::argmin, 0>;
+template struct test_arg_ops<migraphx::op::argmin, 1>;
+template struct test_arg_ops<migraphx::op::argmin, 2>;
+template struct test_arg_ops<migraphx::op::argmin, 3>;

 struct test_conv : verify_program<test_conv>
 {
@@ -3344,32 +3372,13 @@ struct test_lstm_bidirct_default_actv2 : verify_program<test_lstm_bidirct_defaul
    }
 };

-template <int Axis>
-struct test_logsoftmax : verify_program<test_logsoftmax<Axis>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3, 4, 5, 6}};
-        auto param = p.add_parameter("0", s);
-        p.add_instruction(migraphx::op::logsoftmax{Axis}, param);
-
-        return p;
-    }
-};
-
-template struct test_logsoftmax<0>;
-template struct test_logsoftmax<1>;
-template struct test_logsoftmax<2>;
-template struct test_logsoftmax<3>;
-
-template <int Axis>
-struct test_logsoftmax_1 : verify_program<test_logsoftmax_1<Axis>>
+template <int Axis, migraphx::shape::type_t T>
+struct test_logsoftmax : verify_program<test_logsoftmax<Axis, T>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3}};
+        migraphx::shape s{T, {10, 4, 2080, 6}};
        auto param = p.add_parameter("0", s);
        p.add_instruction(migraphx::op::logsoftmax{Axis}, param);

@@ -3377,7 +3386,16 @@ struct test_logsoftmax_1 : verify_program<test_logsoftmax_1<Axis>>
    }
 };

-template struct test_logsoftmax_1<0>;
+template struct test_logsoftmax<0, migraphx::shape::float_type>;
+template struct test_logsoftmax<1, migraphx::shape::float_type>;
+template struct test_logsoftmax<2, migraphx::shape::float_type>;
+template struct test_logsoftmax<3, migraphx::shape::float_type>;
+template struct test_logsoftmax<1, migraphx::shape::double_type>;
+template struct test_logsoftmax<3, migraphx::shape::double_type>;
+template struct test_logsoftmax<1, migraphx::shape::half_type>;
+template struct test_logsoftmax<0, migraphx::shape::half_type>;
+template struct test_logsoftmax<2, migraphx::shape::half_type>;
+template struct test_logsoftmax<3, migraphx::shape::half_type>;

 struct test_fp32_fp16_lall : verify_program<test_fp32_fp16_lall>
 {