Merge branch 'develop' into tf-transpose

2ee59b2b · mvermeulen · GitHub · d5ade1e7 · 8d5a2210 · 2ee59b2b
Unverified Commit 2ee59b2b authored Jul 08, 2019 by mvermeulen Committed by GitHub Jul 08, 2019
20 changed files
--- a/src/include/migraphx/op/argmax.hpp
+++ b/src/include/migraphx/op/argmax.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_ARGMAX_HPP
+#define MIGRAPHX_GUARD_OPERATORS_ARGMAX_HPP
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/par_dfor.hpp>
+#include <migraphx/config.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Operator that reports, for every slice taken along `axis`, the position of
+// the largest element in that slice.
+struct argmax
+{
+    // Dimension that is searched; its length is 1 in the output shape.
+    int64_t axis = 0;
+
+    // Hands `axis` to the reflection visitor `f` under the name "axis".
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"));
+    }
+
+    std::string name() const { return "argmax"; }
+
+    // Output shape: int64 elements with the input dimensions, `axis` set to 1.
+    // Throws when `axis` is negative or not smaller than the input rank.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1).standard();
+        auto out_lens   = inputs[0].lens();
+        const auto rank = static_cast<int64_t>(out_lens.size());
+        if(axis < 0 || axis >= rank)
+        {
+            MIGRAPHX_THROW("ARGMAX: axis is out of range.");
+        }
+        out_lens[axis] = 1;
+        return {shape::int64_type, out_lens};
+    }
+
+    // Walks `item_num` positions along `axis`, starting from the element that
+    // `indices` addresses on entry, and returns the position of the first
+    // largest value. `indices[axis]` is overwritten while scanning.
+    template <class T>
+    int64_t calc_argmax(T& input, std::vector<std::size_t>& indices, size_t item_num) const
+    {
+        auto best_val    = input(indices.begin(), indices.end());
+        int64_t best_pos = 0;
+        std::size_t pos  = 1;
+        while(pos < item_num)
+        {
+            indices[axis]  = pos;
+            auto candidate = input(indices.begin(), indices.end());
+            if(best_val < candidate)
+            {
+                best_val = candidate;
+                best_pos = pos;
+            }
+            ++pos;
+        }
+        return best_pos;
+    }
+
+    // Fills one output element per slice by running calc_argmax on it.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto axis_len = args.front().get_shape().lens()[axis];
+        result.visit([&](auto out_view) {
+            args[0].visit([&](auto in_view) {
+                par_for(output_shape.elements(), [&](auto elem) {
+                    // multi-index of the output element; its `axis` entry is the scan start
+                    auto start_idx = output_shape.multi(elem);
+                    out_view[elem] = this->calc_argmax(in_view, start_idx, axis_len);
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/argmin.hpp
+++ b/src/include/migraphx/op/argmin.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_ARGMIN_HPP
+#define MIGRAPHX_GUARD_OPERATORS_ARGMIN_HPP
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/par_dfor.hpp>
+#include <migraphx/config.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Operator that reports, for every slice taken along `axis`, the position of
+// the smallest element in that slice.
+struct argmin
+{
+    // Dimension that is searched; its length is 1 in the output shape.
+    int64_t axis = 0;
+
+    // Hands `axis` to the reflection visitor `f` under the name "axis".
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"));
+    }
+
+    std::string name() const { return "argmin"; }
+
+    // Output shape: int64 elements with the input dimensions, `axis` set to 1.
+    // Throws when `axis` is negative or not smaller than the input rank.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1).standard();
+        auto out_lens   = inputs[0].lens();
+        const auto rank = static_cast<int64_t>(out_lens.size());
+        if(axis < 0 || axis >= rank)
+        {
+            MIGRAPHX_THROW("ARGMIN: axis is out of range.");
+        }
+        out_lens[axis] = 1;
+        return {shape::int64_type, out_lens};
+    }
+
+    // Walks `item_num` positions along `axis`, starting from the element that
+    // `indices` addresses on entry, and returns the position of the first
+    // smallest value. `indices[axis]` is overwritten while scanning.
+    template <class T>
+    int64_t calc_argmin(T& input, std::vector<std::size_t>& indices, size_t item_num) const
+    {
+        auto best_val    = input(indices.begin(), indices.end());
+        int64_t best_pos = 0;
+        std::size_t pos  = 1;
+        while(pos < item_num)
+        {
+            indices[axis]  = pos;
+            auto candidate = input(indices.begin(), indices.end());
+            if(best_val > candidate)
+            {
+                best_val = candidate;
+                best_pos = pos;
+            }
+            ++pos;
+        }
+        return best_pos;
+    }
+
+    // Fills one output element per slice by running calc_argmin on it.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        std::size_t axis_len = args.front().get_shape().lens()[axis];
+        result.visit([&](auto out_view) {
+            args[0].visit([&](auto in_view) {
+                par_for(output_shape.elements(), [&](auto elem) {
+                    // multi-index of the output element; its `axis` entry is the scan start
+                    auto start_idx = output_shape.multi(elem);
+                    out_view[elem] = this->calc_argmin(in_view, start_idx, axis_len);
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/logsoftmax.hpp
+++ b/src/include/migraphx/op/logsoftmax.hpp
 #ifndef MIGRAPHX_GUARD_OPERATORS_LOGSOFTMAX_HPP
 #define MIGRAPHX_GUARD_OPERATORS_LOGSOFTMAX_HPP
-#include <array>
 #include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/streamutils.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
-#include <cmath>
-#include <utility>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/include/migraphx/op/softmax.hpp
+++ b/src/include/migraphx/op/softmax.hpp
 #ifndef MIGRAPHX_GUARD_OPERATORS_SOFTMAX_HPP
 #define MIGRAPHX_GUARD_OPERATORS_SOFTMAX_HPP
-#include <array>
 #include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/streamutils.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
-#include <cmath>
-#include <utility>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -5,6 +5,8 @@
 #include <migraphx/op/abs.hpp>
 #include <migraphx/op/acos.hpp>
 #include <migraphx/op/add.hpp>
+#include <migraphx/op/argmax.hpp>
+#include <migraphx/op/argmin.hpp>
 #include <migraphx/op/asin.hpp>
 #include <migraphx/op/as_shape.hpp>
 #include <migraphx/op/atan.hpp>

--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -63,6 +63,8 @@ struct onnx_parser
        add_variadic_op("Max", op::max{});
        add_variadic_op("Min", op::min{});
+        add_mem_op("ArgMax", &onnx_parser::parse_argmax);
+        add_mem_op("ArgMin", &onnx_parser::parse_argmin);
        add_mem_op("Clip", &onnx_parser::parse_clip);
        add_mem_op("LRN", &onnx_parser::parse_lrn);
        add_mem_op("ImageScaler", &onnx_parser::parse_imagescaler);
@@ -93,6 +95,7 @@ struct onnx_parser
        add_mem_op("GRU", &onnx_parser::parse_gru);
        add_mem_op("LSTM", &onnx_parser::parse_lstm);
        add_mem_op("Pad", &onnx_parser::parse_pad);
+        add_mem_op("ReduceSum", &onnx_parser::parse_reduce_sum);
        // init the activation function map
        init_actv_func();
@@ -274,6 +277,60 @@ struct onnx_parser
        return prog.add_instruction(op::logsoftmax{axis}, std::move(args));
    }
+    // Translates an ONNX ArgMax node into an op::argmax instruction.
+    //
+    // Attributes read:
+    //   axis     - dimension to search (default 0). A negative value counts
+    //              back from the last dimension of the first input.
+    //   keepdims - 0 removes the searched dimension from the result with
+    //              op::squeeze; any other value (default 1) keeps it.
+    //
+    // Returns the last instruction added to the program.
+    instruction_ref parse_argmax(const std::string&,
+                                 const attribute_map& attributes,
+                                 std::vector<instruction_ref> args)
+    {
+        int64_t axis = 0;
+        if(contains(attributes, "axis"))
+        {
+            axis = static_cast<int64_t>(parse_value(attributes.at("axis")).at<int>());
+        }
+        // op::argmax throws on a negative axis, so map it onto [0, rank) here.
+        // A value that is still negative afterwards is left for the operator to reject.
+        if(axis < 0 && !args.empty())
+        {
+            axis += static_cast<int64_t>(args.front()->get_shape().lens().size());
+        }
+        int keep_dims = 1;
+        if(contains(attributes, "keepdims"))
+        {
+            keep_dims = parse_value(attributes.at("keepdims")).at<int>();
+        }
+        auto ins = prog.add_instruction(op::argmax{axis}, std::move(args));
+        if(keep_dims == 0)
+        {
+            ins = prog.add_instruction(op::squeeze{{axis}}, ins);
+        }
+        return ins;
+    }
+    // Translates an ONNX ArgMin node into an op::argmin instruction.
+    //
+    // Attributes read:
+    //   axis     - dimension to search (default 0). A negative value counts
+    //              back from the last dimension of the first input.
+    //   keepdims - 0 removes the searched dimension from the result with
+    //              op::squeeze; any other value (default 1) keeps it.
+    //
+    // Returns the last instruction added to the program.
+    instruction_ref parse_argmin(const std::string&,
+                                 const attribute_map& attributes,
+                                 std::vector<instruction_ref> args)
+    {
+        int64_t axis = 0;
+        if(contains(attributes, "axis"))
+        {
+            axis = static_cast<int64_t>(parse_value(attributes.at("axis")).at<int>());
+        }
+        // op::argmin throws on a negative axis, so map it onto [0, rank) here.
+        // A value that is still negative afterwards is left for the operator to reject.
+        if(axis < 0 && !args.empty())
+        {
+            axis += static_cast<int64_t>(args.front()->get_shape().lens().size());
+        }
+        int keep_dims = 1;
+        if(contains(attributes, "keepdims"))
+        {
+            keep_dims = parse_value(attributes.at("keepdims")).at<int>();
+        }
+        auto ins = prog.add_instruction(op::argmin{axis}, std::move(args));
+        if(keep_dims == 0)
+        {
+            ins = prog.add_instruction(op::squeeze{{axis}}, ins);
+        }
+        return ins;
+    }
    instruction_ref
    parse_conv(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
    {
@@ -1230,6 +1287,40 @@ struct onnx_parser
        return {hidden_states, last_output, last_cell_output};
    }
+    // Translates an ONNX ReduceSum node into an op::reduce_sum instruction.
+    //
+    // Attributes read:
+    //   axes     - dimensions to sum over; when absent every dimension of the
+    //              first input is reduced. Negative entries count back from
+    //              the last dimension.
+    //   keepdims - 0 removes the reduced dimensions from the result with
+    //              op::squeeze; any other value (default 1) keeps them, the
+    //              same convention parse_argmax / parse_argmin use.
+    //
+    // Returns the last instruction added to the program.
+    instruction_ref parse_reduce_sum(const std::string&,
+                                     attribute_map attributes,
+                                     std::vector<instruction_ref> args)
+    {
+        std::size_t n_dim = args.front()->get_shape().lens().size();
+        std::vector<std::size_t> axes;
+        if(contains(attributes, "axes"))
+        {
+            for(auto attr_axis : attributes["axes"].ints())
+            {
+                int64_t axis = attr_axis;
+                // converting a negative value to std::size_t would wrap around,
+                // so bring it into [0, n_dim) first
+                if(axis < 0)
+                {
+                    axis += static_cast<int64_t>(n_dim);
+                }
+                axes.push_back(static_cast<std::size_t>(axis));
+            }
+        }
+        else
+        {
+            // default to reduce over all dimensions
+            axes.resize(n_dim);
+            std::iota(axes.begin(), axes.end(), 0);
+        }
+
+        int keep_dims = 1;
+        if(contains(attributes, "keepdims"))
+        {
+            keep_dims = parse_value(attributes.at("keepdims")).at<int>();
+        }
+
+        auto ins = prog.add_instruction(op::reduce_sum{axes}, std::move(args));
+        if(keep_dims == 0)
+        {
+            std::vector<int64_t> squeeze_axes{axes.begin(), axes.end()};
+            ins = prog.add_instruction(op::squeeze{squeeze_axes}, ins);
+        }
+        return ins;
+    }
    void parse_from(std::istream& is)
    {
        onnx::ModelProto model;

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -13,6 +13,8 @@
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
 #include <migraphx/op/softmax.hpp>
+#include <migraphx/op/argmax.hpp>
+#include <migraphx/op/argmin.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -12,6 +12,8 @@ endif()
 add_library(migraphx_device
    device/add.cpp
+    device/argmax.cpp
+    device/argmin.cpp
    device/max.cpp
    device/min.cpp
    device/exp.cpp
@@ -44,6 +46,8 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
 add_library(migraphx_gpu
+    argmax.cpp
+    argmin.cpp
    eliminate_workspace.cpp
    fuse_ops.cpp
    hip.cpp

--- a/src/targets/gpu/argmax.cpp
+++ b/src/targets/gpu/argmax.cpp
+#include <migraphx/gpu/argmax.hpp>
+#include <migraphx/gpu/device/argmax.hpp>
+#include <migraphx/gpu/context.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Requires exactly two inputs that pass the `standard()` check and derives the
+// result shape from the first one through the wrapped op::argmax.
+shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{inputs, *this}.has(2).standard();
+    return op.compute_shape({inputs.at(0)});
+}
+// Calls device::argmax on the context's stream with args.back() as the result
+// buffer and args.front() as the input, then returns args.back().
+argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::argmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/argmin.cpp
+++ b/src/targets/gpu/argmin.cpp
+#include <migraphx/gpu/argmin.hpp>
+#include <migraphx/gpu/device/argmin.hpp>
+#include <migraphx/gpu/context.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Requires exactly two inputs that pass the `standard()` check and derives the
+// result shape from the first one through the wrapped op::argmin.
+shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{inputs, *this}.has(2).standard();
+    return op.compute_shape({inputs.at(0)});
+}
+// Calls device::argmin on the context's stream with args.back() as the result
+// buffer and args.front() as the input, then returns args.back().
+argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::argmin(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/argmax.cpp
+++ b/src/targets/gpu/device/argmax.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/argmax.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/arg_op.hpp>
+#include <migraphx/gpu/hip.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Forwards to the shared arg_op implementation with the argmax_op functor.
+void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
+{
+    arg_op(argmax_op{}, stream, result, arg, axis);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/argmin.cpp
+++ b/src/targets/gpu/device/argmin.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/argmin.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/arg_op.hpp>
+#include <migraphx/gpu/hip.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Forwards to the shared arg_op implementation with the argmin_op functor.
+void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
+{
+    arg_op(argmin_op{}, stream, result, arg, axis);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -128,7 +128,7 @@ __device__ T dpp_mov(T& x)
 template <class T, class Op>
 __device__ void dpp_reduce(T& in, Op op)
 {
-    T out;
+    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);

--- a/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+#include <migraphx/shape.hpp>
+#include <migraphx/op/argmax.hpp>
+#include <migraphx/gpu/device/argmax.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+// GPU-target wrapper around op::argmax; registered under the name "gpu::argmax".
+struct hip_argmax
+{
+    // Wrapped reference operator; supplies `axis` and the output-shape rule.
+    op::argmax op;
+    // Reflection is delegated to the wrapped operator.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+    std::string name() const { return "gpu::argmax"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    // Reports the last input as the one the output aliases.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmin.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP
+#include <migraphx/shape.hpp>
+#include <migraphx/op/argmin.hpp>
+#include <migraphx/gpu/device/argmin.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+// GPU-target wrapper around op::argmin; registered under the name "gpu::argmin".
+struct hip_argmin
+{
+    // Wrapped reference operator; supplies `axis` and the output-shape rule.
+    op::argmin op;
+    // Reflection is delegated to the wrapped operator.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+    std::string name() const { return "gpu::argmin"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    // Reports the last input as the one the output aliases.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
+#include <migraphx/gpu/hip.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// A value together with the position it was read from; the element type that
+// argmax_op / argmin_op reduce over.
+template <class T>
+struct val_index
+{
+    // value being compared
+    T val;
+    // position of `val` along the reduced axis; make_val_index(v) stores -1 here
+    int64_t index;
+};
+// Builds a val_index that carries only a value; its index is set to -1.
+template <class T>
+MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v)
+{
+    return {v, -1};
+}
+// Builds a val_index from value `v` and position `i`.
+template <class T>
+MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v, int64_t i)
+{
+    return {v, i};
+}
+// Binary reduction functor for argmax: of two (value, index) pairs it keeps
+// the one with the larger value, and on equal values the one with the smaller
+// index.
+struct argmax_op
+{
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        // A negative index marks the seed built by make_val_index(v), which
+        // stands for "no element". It must never be selected over a real
+        // element: otherwise an input whose values equal (or fall below) the
+        // seed value would produce -1 as the result index.
+        if(x.index < 0)
+            return y;
+        if(y.index < 0)
+            return x;
+        if(x.val > y.val)
+            return x;
+        if(x.val < y.val)
+            return y;
+        // equal (or unordered) values: prefer the earlier position
+        return (x.index < y.index) ? x : y;
+    }
+    // Seed value for the reduction.
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); }
+};
+// Binary reduction functor for argmin: of two (value, index) pairs it keeps
+// the one with the smaller value, and on equal values the one with the smaller
+// index.
+struct argmin_op
+{
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        // A negative index marks the seed built by make_val_index(v), which
+        // stands for "no element". It must never be selected over a real
+        // element: otherwise an input whose values equal (or exceed) the
+        // seed value would produce -1 as the result index.
+        if(x.index < 0)
+            return y;
+        if(y.index < 0)
+            return x;
+        if(x.val < y.val)
+            return x;
+        if(x.val > y.val)
+            return y;
+        // equal (or unordered) values: prefer the earlier position
+        return (x.index < y.index) ? x : y;
+    }
+    // Seed value for the reduction.
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); }
+};
+// Shared implementation behind device::argmax / device::argmin.
+//
+// For every position of `arg` with dimension `axis` collapsed to 1, reduces the
+// elements along `axis` with `op` and writes the index of the selected element
+// into `result`, which is accessed as int64_t data.
+//
+//   op     - reduction functor (argmax_op or argmin_op); also supplies the seed via init()
+//   stream - HIP stream handed to gs_launch
+//   result - output buffer, indexed with the collapsed ("batch") shape
+//   arg    - input tensor
+//   axis   - dimension to reduce; used directly as an index into the lens
+template <class Op>
+void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
+{
+    auto arg_shape        = arg.get_shape();
+    auto lens             = arg_shape.lens();
+    auto batch_lens       = lens;
+    size_t batch_item_num = lens[axis];
+    // batch_shape is the input shape with the reduced dimension set to 1
+    batch_lens[axis]      = 1;
+    migraphx::shape batch_shape{arg_shape.type(), batch_lens};
+    hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
+        auto output = device_cast(result.get<int64_t>().data());
+        using type  = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+        // use one block for items in one batch.
+        const size_t max_block_size  = 256;
+        const std::size_t block_size = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            // i / block_size selects the batch element this block works on
+            auto batch_idx = batch_s.multi(i / block_size);
+            auto data_idx  = batch_idx;
+            // seed pair: value from op.init(), index -1
+            auto init      = make_val_index<type>(op.init());
+            auto op_output =
+                block_reduce<max_block_size>(idx, op, init, batch_item_num, [&](auto j) __device__ {
+                    // read element j along the reduced axis and tag it with its position
+                    data_idx[axis] = j;
+                    return make_val_index(input[arg_s.index(data_idx)], j);
+                });
+            // only the first thread of the block stores the result
+            if(idx.local == 0)
+            {
+                output[batch_s.index(batch_idx)] = op_output.index;
+            }
+        });
+    });
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Device argmax entry point: `result` receives the output, `arg` is the input
+// tensor and `axis` the dimension to reduce; `stream` is the HIP stream to use.
+void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Device argmin entry point: `result` receives the output, `arg` is the input
+// tensor and `axis` the dimension to reduce; `stream` is the HIP stream to use.
+void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -11,6 +11,8 @@
 #include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/gpu/argmax.hpp>
+#include <migraphx/gpu/argmin.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
@@ -102,6 +104,8 @@ struct miopen_apply
        add_extend_op<hip_concat, op::concat>("concat");
        add_extend_op<hip_softmax, op::softmax>("softmax");
        add_extend_op<hip_logsoftmax, op::logsoftmax>("logsoftmax");
+        add_extend_op<hip_argmax, op::argmax>("argmax");
+        add_extend_op<hip_argmin, op::argmin>("argmin");
        add_extend_op<hip_gather, op::gather>("gather");
        add_extend_op<hip_pad, op::pad>("pad");
        add_extend_op<hip_convert, op::convert>("convert");

--- a/test/cpu_ops_test.cpp
+++ b/test/cpu_ops_test.cpp
@@ -941,9 +941,6 @@ TEST_CASE(softmax_simple_test)
    auto result = p.eval({});
    std::vector<float> results_vector(2);
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
-    for(auto v : results_vector)
-        std::cout << v << "\t";
-    std::cout << std::endl;
    EXPECT(migraphx::verify_range(results_vector, s));
 }
@@ -1138,6 +1135,114 @@ TEST_CASE(logsoftmax_test_axis_3)
    EXPECT(migraphx::verify_range(results_vector, s));
 }
+// argmax along axis 0 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmax_test_0)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmax{0}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
+// argmax along axis 1 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmax_test_1)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {0, 0, 2, 1, 2, 0, 0, 2};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmax{1}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
+// argmax along axis 2 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmax_test_2)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {1, 3, 2, 2, 2, 3};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmax{2}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
+// argmin along axis 0 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmin_test_0)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmin{0}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
+// argmin along axis 1 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmin_test_1)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {2, 2, 0, 2, 0, 1, 2, 0};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmin{1}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
+// argmin along axis 2 of a 2x3x4 float tensor on the cpu target.
+TEST_CASE(argmin_test_2)
+{
+    std::vector<float> values = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
+                                 -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
+                                 0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
+    std::vector<int64_t> expected = {2, 1, 0, 3, 3, 2};
+    migraphx::shape input_shape{migraphx::shape::float_type, {2, 3, 4}};
+
+    migraphx::program prog;
+    auto input = prog.add_literal(migraphx::literal{input_shape, values});
+    prog.add_instruction(migraphx::op::argmin{2}, input);
+    prog.compile(migraphx::cpu::target{});
+
+    auto outcome = prog.eval({});
+    std::vector<int64_t> indices;
+    outcome.visit([&](auto view) { indices.assign(view.begin(), view.end()); });
+    EXPECT(migraphx::verify_range(indices, expected));
+}
 TEST_CASE(conv2d_test)
 {
    migraphx::program p;