"testing/vscode:/vscode.git/clone" did not exist on "225aca612e7e175a2c9ea4f73ba4140005a29759"
Commit 3a474fca authored by Khalique Ahmed

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents d9568511 0b7672d7
@@ -4,7 +4,7 @@ CheckOptions:
   - key: bugprone-unused-return-value.CheckedFunctions
     value: '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
   - key: cppcoreguidelines-macro-usage.AllowedRegexp
-    value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED'
+    value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
   - key: modernize-loop-convert.MinConfidence
     value: risky
   - key: modernize-loop-convert.NamingStyle
...
@@ -223,7 +223,7 @@ rocm_create_package(
     MAINTAINER "Paul Fultz II <paul.fultz@amd.com>"
     LDCONFIG
     PTH
-    DEPENDS miopen-hip rocblas hip-hcc half
+    DEPENDS miopen-hip rocblas hip-rocclr hip-base half
 )
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
...
@@ -124,8 +124,9 @@ def onnxnode(name, body) {
 rocmtest onnx: onnxnode('rocmtest') { cmake_build ->
     stage("Onnx runtime") {
         sh '''
+            apt install half
             ls -lR
-            dpkg -i --force-depends ./build/*.deb
+            dpkg -i ./build/*.deb
             cd /onnxruntime && ./build_and_test_onnxrt.sh
         '''
     }
...
@@ -129,6 +129,7 @@ register_migraphx_ops(
     min
     mul
     multibroadcast
+    multinomial
     neg
     outline
     pad
...
#ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP

#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/ranges.hpp> // for contains
#include <algorithm>           // for std::upper_bound
#include <random>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {

struct multinomial
{
    shape::type_t dtype = shape::type_t::int32_type;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.dtype, "dtype"));
    }

    std::string name() const { return "multinomial"; }

    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(2).only_dims(2);
        size_t sample_size = inputs.back().lens().back();

        if(not contains({shape::int32_type, shape::int64_type}, dtype))
            MIGRAPHX_THROW(
                "Multinomial: Invalid output type. Valid types are int32_type and int64_type.");

        return {dtype, {inputs.front().lens().front(), sample_size}};
    }

    // args[0]: running (unnormalized) per-row CDF of class probabilities
    // args[1]: one uniform draw in [0, 1) per output sample
    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        size_t batch_size  = output_shape.lens().front();
        size_t class_size  = args[0].get_shape().lens().back();
        size_t sample_size = output_shape.lens().back();

        visit_all(args[0], args[1])([&](auto cdf, auto dist) {
            result.visit([&](auto output) {
                par_for(batch_size * sample_size, [&](auto i) {
                    auto idx       = args[1].get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx[0] * class_size);
                    auto cdf_end   = cdf_begin + class_size;
                    // Scale the draw by the row total (last CDF entry) instead of
                    // normalizing the CDF, then binary-search for the sampled class.
                    auto sample_iter =
                        std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });

        return result;
    }
};

} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
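For reference, the sampling rule the compute function implements can be exercised on the host with the standard library alone. A minimal sketch (illustrative only; sample_class, cdf, and u are not names from the commit):

#include <algorithm>
#include <cassert>
#include <vector>

// cdf holds a running, unnormalized sum of class probabilities for one
// batch row; u is a uniform draw in [0, 1).
static int sample_class(const std::vector<float>& cdf, float u)
{
    // Scale u by the total mass (last CDF entry) rather than normalizing.
    auto it = std::upper_bound(cdf.begin(), cdf.end(), u * cdf.back());
    return static_cast<int>(std::distance(cdf.begin(), it));
}

int main()
{
    std::vector<float> cdf = {0.1f, 0.4f, 1.0f}; // class masses 0.1, 0.3, 0.6
    assert(sample_class(cdf, 0.05f) == 0);
    assert(sample_class(cdf, 0.25f) == 1);
    assert(sample_class(cdf, 0.90f) == 2);
}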
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <cmath> // for std::pow

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_depthtospace : op_parser<parse_depthtospace>
{
    std::vector<op_desc> operators() const { return {{"DepthToSpace"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        auto s = args[0]->get_shape();

        // mode attribute of DepthToSpace: either "DCR" or "CRD"
        auto mode = std::string("DCR");
        if(contains(info.attributes, "mode"))
        {
            mode = info.attributes.at("mode").s();
        }

        // blocksize attribute of DepthToSpace
        int blocksize = 0;
        if(contains(info.attributes, "blocksize"))
        {
            blocksize = info.attributes.at("blocksize").i();
        }
        if(blocksize < 1)
        {
            MIGRAPHX_THROW("DepthToSpace: blocksize is less than 1");
        }

        // calculate dimensions
        auto lens1 = s.lens();
        auto lens2 = s.lens();
        unsigned long divisor = std::pow(blocksize, 2);
        if((lens2[1] % divisor) == 0)
            lens2[1] = lens2[1] / divisor;
        else
            MIGRAPHX_THROW("DepthToSpace: channel count is not divisible by blocksize squared");
        lens1.push_back(lens1[2]);
        lens1.push_back(lens1[3]);
        lens2[2] = lens2[2] * blocksize;
        lens2[3] = lens2[3] * blocksize;
        lens1[2] = blocksize;

        std::vector<int64_t> perm;
        if(mode == "DCR")
        {
            lens1[3] = lens1[1] / divisor;
            lens1[1] = blocksize;
            perm     = {0, 3, 4, 1, 5, 2};
        }
        else if(mode == "CRD")
        {
            lens1[1] = lens1[1] / divisor;
            lens1[3] = blocksize;
            perm     = {0, 1, 4, 2, 5, 3};
        }
        else
            MIGRAPHX_THROW("DepthToSpace: mode attribute must be \"DCR\" or \"CRD\".");

        auto temp1 = info.add_instruction(make_op("reshape", {{"dims", lens1}}), args[0]);
        auto temp2 = info.add_instruction(make_op("transpose", {{"permutation", perm}}), temp1);
        return info.add_instruction(make_op("reshape", {{"dims", lens2}}),
                                    info.make_contiguous(temp2));
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
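A concrete shape trace clarifies the two reshapes: for a 1x8x2x2 input with blocksize 2 in DCR mode, lens1 is {1, 2, 2, 2, 2, 2} (N, b, b, C/b², H, W), the permutation {0, 3, 4, 1, 5, 2} yields (N, C/b², H, b, W, b), and the final reshape to lens2 = {1, 2, 4, 4} interleaves the block elements into the spatial dimensions. A minimal sketch of that bookkeeping (hypothetical standalone code, not part of the commit):

#include <cstdio>

int main()
{
    long n = 1, c = 8, h = 2, w = 2, b = 2; // example input dims and blocksize
    long div = b * b;
    // DCR: lens1 = {n, b, b, c/div, h, w}, permute {0, 3, 4, 1, 5, 2},
    // then reshape to lens2 = {n, c/div, h*b, w*b}
    std::printf("lens2: %ld %ld %ld %ld\n", n, c / div, h * b, w * b); // 1 2 4 4
    return 0;
}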
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <algorithm> // for std::generate
#include <chrono>    // for the default time-based seed
#include <random>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_multinomial : op_parser<parse_multinomial>
{
    std::vector<op_desc> operators() const { return {{"Multinomial"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype = 6; // ONNX default: int32
        if(contains(info.attributes, "dtype"))
            dtype = info.attributes.at("dtype").i();
        shape::type_t output_type = get_type(dtype);

        size_t sample_size = 1;
        if(contains(info.attributes, "sample_size"))
            sample_size = info.attributes.at("sample_size").i();

        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        // Subtract the per-batch maximum log-probability, making the per-batch max 0
        auto maxes =
            info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
        auto mb_maxes = info.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", args[0]->get_shape().lens()}}),
            maxes);
        auto cdf = info.add_instruction(migraphx::make_op("sub"), args[0], mb_maxes);

        // Take the element-wise exponent to get probabilities in the range (0, 1]
        cdf = info.add_instruction(migraphx::make_op("exp"), cdf);

        // Compute the cumulative distribution function
        cdf = info.add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

        // Pre-compute the random distribution on the host
        std::mt19937 gen(seed);
        std::uniform_real_distribution<> dis(0.0, 1.0);
        size_t batch_size = args[0]->get_shape().lens().front();
        migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};

        std::vector<float> random_dist(batch_size * sample_size);
        std::generate(random_dist.begin(), random_dist.end(), [&]() { return dis(gen); });
        auto dist_lit = info.add_literal(migraphx::literal{dist_shape, random_dist});

        return info.add_instruction(
            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, dist_lit);
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
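The reduce_max / sub / exp / prefix_scan_sum chain builds a numerically stable, unnormalized CDF; the explicit softmax division is unnecessary because the multinomial op scales each uniform draw by the row total. A host-side sketch of the same computation for one row (illustrative names, not part of the commit):

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

// Same math as the parser's instruction chain, for a single row of logits:
// subtract the max, exponentiate, then take a running sum.
std::vector<float> unnormalized_cdf(std::vector<float> logits)
{
    float m = *std::max_element(logits.begin(), logits.end());
    for(auto& x : logits)
        x = std::exp(x - m); // every term is now in (0, 1]
    std::partial_sum(logits.begin(), logits.end(), logits.begin());
    return logits; // the last element is the softmax denominator
}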
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/onnx/checks.hpp>
#include <algorithm> // for std::generate
#include <chrono>    // for the default time-based seed
#include <random>
#include <set>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_randomnormal_ops : op_parser<parse_randomnormal_ops>
{
    const std::set<shape::type_t> valid_types = {
        shape::float_type, shape::half_type, shape::double_type};

    std::vector<op_desc> operators() const { return {{"RandomNormal"}, {"RandomNormalLike"}}; }

    instruction_ref parse(const op_desc& opd,
                          const onnx_parser& parser,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype      = 1; // ONNX default: float
        bool use_dtype = false;
        if(contains(info.attributes, "dtype"))
        {
            dtype     = info.attributes.at("dtype").i();
            use_dtype = true;
        }
        shape::type_t out_type = get_type(dtype);
        if(not contains(valid_types, out_type))
            MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                           ". Valid types are 1 (float), 10 (half), and 11 (double).");

        float mean = 0.0;
        if(contains(info.attributes, "mean"))
            mean = info.attributes.at("mean").f();

        float scale = 1.0;
        if(contains(info.attributes, "scale"))
            scale = info.attributes.at("scale").f();

        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        shape out_shape;
        if(contains(info.attributes, "shape"))
        {
            // RandomNormal:
            // output type and shape must come from attributes
            std::vector<int> out_lens;
            literal ls = parser.parse_value(info.attributes.at("shape"));
            ls.visit([&](auto s) { out_lens.assign(s.begin(), s.end()); });
            out_shape = shape{out_type, out_lens};
        }
        else if(args.size() == 1)
        {
            // RandomNormalLike:
            // output type and shape are the same as the input's by default;
            // dtype is used instead when the attribute is set
            if(not contains(valid_types, args[0]->get_shape().type()))
                MIGRAPHX_THROW(opd.op_name + ": invalid output type: " +
                               std::to_string(args[0]->get_shape().type()) +
                               ". Valid types are float, half, and double.");
            out_shape =
                use_dtype ? shape{out_type, args[0]->get_shape().lens()} : args[0]->get_shape();
        }
        else
        {
            MIGRAPHX_THROW(opd.op_name +
                           ": cannot deduce shape without shape attribute or argument.");
        }

        std::mt19937 gen(seed);
        std::normal_distribution<> d(mean, scale);
        std::vector<double> rand_vals(out_shape.elements());
        std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });

        return info.add_literal(literal{out_shape, rand_vals});
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/onnx/checks.hpp>
#include <algorithm> // for std::generate
#include <chrono>    // for the default time-based seed
#include <random>
#include <set>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_randomuniform_ops : op_parser<parse_randomuniform_ops>
{
    const std::set<shape::type_t> valid_types = {
        shape::float_type, shape::half_type, shape::double_type};

    std::vector<op_desc> operators() const { return {{"RandomUniform"}, {"RandomUniformLike"}}; }

    instruction_ref parse(const op_desc& opd,
                          const onnx_parser& parser,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype      = 1; // ONNX default: float
        bool use_dtype = false;
        if(contains(info.attributes, "dtype"))
        {
            dtype     = info.attributes.at("dtype").i();
            use_dtype = true;
        }
        shape::type_t out_type = get_type(dtype);
        if(not contains(valid_types, out_type))
            MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                           ". Valid types are 1 (float), 10 (half), and 11 (double).");

        float high = 1.0;
        if(contains(info.attributes, "high"))
            high = info.attributes.at("high").f();

        float low = 0.0;
        if(contains(info.attributes, "low"))
            low = info.attributes.at("low").f();

        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        shape out_shape;
        if(contains(info.attributes, "shape"))
        {
            // RandomUniform:
            // output type and shape must come from attributes
            std::vector<int> out_lens;
            literal ls = parser.parse_value(info.attributes.at("shape"));
            ls.visit([&](auto s) { out_lens.assign(s.begin(), s.end()); });
            out_shape = shape{out_type, out_lens};
        }
        else if(args.size() == 1)
        {
            // RandomUniformLike:
            // output type and shape are the same as the input's by default;
            // dtype is used instead when the attribute is set
            if(not contains(valid_types, args[0]->get_shape().type()))
                MIGRAPHX_THROW(opd.op_name + ": invalid output type: " +
                               std::to_string(args[0]->get_shape().type()) +
                               ". Valid types are float, half, and double.");
            out_shape =
                use_dtype ? shape{out_type, args[0]->get_shape().lens()} : args[0]->get_shape();
        }
        else
        {
            MIGRAPHX_THROW(opd.op_name +
                           ": cannot deduce shape without shape attribute or argument.");
        }

        std::mt19937 gen(seed);
        std::uniform_real_distribution<> d(low, high); // (min, max) order
        std::vector<double> rand_vals(out_shape.elements());
        std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });

        return info.add_literal(literal{out_shape, rand_vals});
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -59,6 +59,7 @@ add_library(migraphx_device
     device/mul.cpp
     device/mul_add.cpp
     device/mul_add_relu.cpp
+    device/multinomial.cpp
     device/pad.cpp
     device/pow.cpp
     device/prelu.cpp
@@ -143,6 +144,7 @@ add_library(migraphx_gpu
     lrn.cpp
     leaky_relu.cpp
     mlir_conv.cpp
+    multinomial.cpp
     pack_args.cpp
     pack_int8_args.cpp
     pad.cpp
@@ -199,6 +201,7 @@ register_migraphx_gpu_ops(hip_
     max
     min
     mul
+    multinomial
     pad
     pow
     prelu
...
@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-#if __AMDGCN_WAVEFRONT_SIZE == 32
-#define MIGRAPHX_NO_DPP
-#endif
-
 #ifdef MIGRAPHX_NO_DPP
 template <index_int N,
           class Op,
@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
     in  = op(in, out);
     out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
     in  = op(in, out);
+#if __AMDGCN_WAVEFRONT_SIZE == 64
     out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
     in  = op(in, out);
     out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
     in  = op(in, out);
+#endif
 }
 
 __device__ inline void dpp_reduce(float& x, sum)
@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
                      "s_nop 1\n"
                      "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
                      "s_nop 1\n"
+#if __AMDGCN_WAVEFRONT_SIZE == 64
                      "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
                      "s_nop 1\n"
                      "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+#endif
                      "s_nop 1\n"
                      : "=v"(x)
                      : "0"(x));
@@ -135,21 +135,27 @@ template <index_int N,
           MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
 __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
 {
-    using type = decltype(f(deduce_for_stride(fs)));
-    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+    constexpr index_int nthreads = 16;
+#else
+    constexpr index_int nthreads = 64;
+#endif
+    using type = decltype(f(deduce_for_stride(fs)));
+    MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
     type x = init;
    fs([&](auto i) { x = op(x, f(i)); });
     dpp_reduce(x, op);
 
-    const auto ldsidx = idx.local / 64;
-    if((idx.local % 64) == 63)
+    const auto ldsidx = idx.local / nthreads;
+    if((idx.local % nthreads) == nthreads - 1)
     {
         buffer[ldsidx] = x;
     }
     __syncthreads();
 
     type y = init;
-    for(index_int i = 0; i < idx.nlocal() / 64; i++)
+    for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
     {
         y = op(y, buffer[i]);
     }
...
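The block_reduce change follows from the dpp_reduce change above: with the row_bcast steps compiled out on wave32 targets, dpp_reduce only combines lanes within a 16-lane row, so one partial per 16 threads has to be staged through LDS instead of one per 64-thread wavefront. A host-side model of that banked two-stage reduction (an assumed simplification, not the device code):

#include <cstddef>
#include <numeric>
#include <vector>

// Two-stage sum: each group of g lanes (16 on wave32, 64 on wave64) forms one
// partial ("dpp_reduce"), the last lane of each group banks it (the LDS
// "buffer"), and the partials are then folded together. Assumes x.size() is a
// multiple of g.
float block_reduce_model(const std::vector<float>& x, std::size_t g)
{
    std::vector<float> partials;
    for(std::size_t base = 0; base < x.size(); base += g)
        partials.push_back(
            std::accumulate(x.begin() + base, x.begin() + base + g, 0.0f));
    return std::accumulate(partials.begin(), partials.end(), 0.0f);
}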
@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
+#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
+#else
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
+#endif
+#endif
+
 template <class T>
 struct vector_type
 {
@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
     const bool in_range = idx.local < relements_v;
 
     auto mean = [&](auto z) {
-        return auto_block_reduce<MaxBlockSize>(
-                   idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
-               value_type(relements);
+        auto m = auto_block_reduce<MaxBlockSize>(
+                     idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
+                 value_type(relements);
+#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+        __builtin_amdgcn_s_barrier();
+#endif
+        return m;
     };
 
     // m = x - mean(x)
...
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

// Device-usable equivalent of std::upper_bound: binary search for the first
// element in the sorted range [first, last) that compares greater than value.
template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
    Iterator it;
    typename std::iterator_traits<Iterator>::difference_type count;
    typename std::iterator_traits<Iterator>::difference_type step;
    count = std::distance(first, last);
    while(count > 0)
    {
        it   = first;
        step = count / 2;
        std::advance(it, step);
        if(!(value < *it))
        {
            first = ++it;
            count -= step + 1;
        }
        else
            count = step;
    }
    return first;
}

void multinomial(hipStream_t stream,
                 const argument& result,
                 const argument& arg0,
                 const argument& arg1)
{
    size_t batch_size  = arg0.get_shape().lens().front();
    size_t class_size  = arg0.get_shape().lens().back();
    size_t sample_size = result.get_shape().lens().back();

    hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
        result.visit([&](auto out) {
            hip_visit_views(out)([&](auto output) {
                gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
                    auto idx       = output.get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx.front() * class_size);
                    auto cdf_end   = cdf_begin + class_size;
                    // Same sampling rule as the reference op: scale the uniform
                    // draw by the row total, then binary-search the CDF.
                    auto sample_iter =
                        upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void multinomial(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP

#include <migraphx/op/multinomial.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_multinomial
{
    op::multinomial op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::multinomial"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;

    // The preallocated output buffer is appended as the last argument
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
@@ -164,6 +164,7 @@ struct miopen_apply
         add_extend_op("leaky_relu");
         add_extend_op("logsoftmax");
         add_extend_op("lrn");
+        add_extend_op("multinomial");
         add_extend_op("pad");
         add_extend_op("pooling");
         add_extend_op("prefix_scan_sum");
...
#include <migraphx/gpu/multinomial.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/check_shapes.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_multinomial::compute_shape(std::vector<shape> inputs) const
{
    check_shapes{inputs, *this}.has(3).only_dims(2).standard();
    // Drop the appended output allocation before delegating to the reference op
    inputs.pop_back();
    return op.compute_shape(inputs);
}

argument
hip_multinomial::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]);
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
[Binary ONNX test files added: depthtospace_crd_test (DepthToSpace, mode "CRD"), depthtospace_simple_test (DepthToSpace, mode "DCR"), and depthtospace_test (DepthToSpace, mode "DCR"); protobuf content not rendered.]