Merge branch 'develop' into mlir-c

dd033c75 · Paul · 50f87a87 · 8829d6ab · dd033c75 · dd033c75
Commit dd033c75 authored Oct 18, 2021 by Paul
20 changed files
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -4,7 +4,7 @@ CheckOptions:
  - key:             bugprone-unused-return-value.CheckedFunctions
    value:           '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
  - key:             cppcoreguidelines-macro-usage.AllowedRegexp
-    value:           'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED'
+    value:           'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
  - key:             modernize-loop-convert.MinConfidence
    value:           risky   
  - key:             modernize-loop-convert.NamingStyle

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,7 @@ find_package(nlohmann_json 3.8.0 REQUIRED)

 include(ROCMSetupVersion)

-rocm_setup_version(VERSION 1.3)
+rocm_setup_version(VERSION 2.0)
 set(MIGRAPHX_SO_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR})

 option( BUILD_SHARED_LIBS "Build as a shared library" ON )
@@ -223,7 +223,7 @@ rocm_create_package(
    MAINTAINER "Paul Fultz II <paul.fultz@amd.com>"
    LDCONFIG
    PTH
-    DEPENDS miopen-hip rocblas hip-hcc half
+    DEPENDS miopen-hip rocblas hip-rocclr hip-base half
 )

 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -124,8 +124,9 @@ def onnxnode(name, body) {
 rocmtest onnx: onnxnode('rocmtest') { cmake_build ->
    stage("Onnx runtime") {
        sh '''
+            apt install half
            ls -lR
-            dpkg -i --force-depends ./build/*.deb
+            dpkg -i ./build/*.deb
            cd /onnxruntime && ./build_and_test_onnxrt.sh
        '''
    }

--- a/cppcheck.rules
+++ b/cppcheck.rules
@@ -152,6 +152,24 @@
        <summary>Else statement is not necessary.</summary>
    </message>
 </rule>
+<rule>
+    <tokenlist>normal</tokenlist>
+    <pattern><![CDATA[((?:(?:\w+|<|>|::) )*(?:\w+|>)(?: &|\*)*) (\w) ; \2 = static_cast < \1 > (\([^()]*(?-1)*[^()]*\)) ;]]></pattern>
+    <message>
+        <id>RedundantCast</id>
+        <severity>style</severity>
+        <summary>Static cast is redundant.</summary>
+    </message>
+</rule>
+<rule>
+    <tokenlist>normal</tokenlist>
+    <pattern><![CDATA[auto (\w) ; \1 = static_cast < (?:(?:\w+|<|>|::) )*(?:\w+|>)(?: &|\*)* > (\([^()]*(?-1)*[^()]*\)) ;]]></pattern>
+    <message>
+        <id>RedundantCast</id>
+        <severity>style</severity>
+        <summary>Static cast is redundant.</summary>
+    </message>
+</rule>
 <rule>
    <tokenlist>normal</tokenlist>
    <pattern><![CDATA[\? (true|false) : (true|false)]]></pattern>

--- a/dev-requirements.txt
+++ b/dev-requirements.txt
 pfultz2/rocm-recipes
 facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
 ccache@v4.1
-danmar/cppcheck@4a8a78a9258fd56bc21e55b5b49a0f09bc8fa750 -DHAVE_RULES=1
+danmar/cppcheck@2.6 -DHAVE_RULES=1
 RadeonOpenCompute/rocm-cmake@ececd2eccae4d01e7ec154efe90ac43ebf4df317 --build
 -f requirements.txt
--- a/examples/migraphx/cpp_parse_load_save/README.md
+++ b/examples/migraphx/cpp_parse_load_save/README.md
@@ -25,8 +25,8 @@ migraphx::save(p, output_file);

 ```
 migraphx::program p = ... <migraphx::program>;
-migraphx_file_options options;
-options.format = "msgpack";
+migraphx::file_options options;
+options.set_file_format("msgpack");
 migraphx::save(p, output_file, options);
 ```

@@ -41,15 +41,15 @@ p = migraphx::load(input_file);

 ```
 migraphx::program p;
-migraphx_file_options options;
-options.format = "msgpack";
+migraphx::file_options options;
+options.set_file_format("msgpack");
 p = migraphx::load(input_file, options);
 ```
 To load a program that has been saved in JSON format:
 ```
 migraphx::program p;
-migraphx_file_options options;
-options.format = "json";
+migraphx::file_options options;
+options.set_file_format("json");
 p = migraphx::load(input_file, options);
 ```


--- a/examples/migraphx/cpp_parse_load_save/parse_load_save.cpp
+++ b/examples/migraphx/cpp_parse_load_save/parse_load_save.cpp
@@ -44,14 +44,14 @@ int main(int argc, char** argv)
        std::string format = load_arg;
        if(format == "json")
        {
-            migraphx_file_options options;
-            options.format = "json";
+            migraphx::file_options options;
+            options.set_file_format("json");
            p = migraphx::load(input_file, options);
        }
        else if(format == "msgpack")
        {
-            migraphx_file_options options;
-            options.format = "msgpack";
+            migraphx::file_options options;
+            options.set_file_format("msgpack");
            p = migraphx::load(input_file, options);
        }
        else
@@ -80,8 +80,8 @@ int main(int argc, char** argv)
        output_file = save_arg == nullptr ? "out" : save_arg;
        output_file.append(".msgpack");

-        migraphx_file_options options;
-        options.format = "msgpack";
+        migraphx::file_options options;
+        options.set_file_format("msgpack");
        migraphx::save(p, output_file.c_str(), options);
        std::cout << "Program has been saved as ./" << output_file << std::endl;
    }

--- a/examples/vision/cpp_mnist/README.md
+++ b/examples/vision/cpp_mnist/README.md
@@ -60,14 +60,14 @@ migraphx::quantize_int8(prog, targ, quant_opts);
 ## Compilation 
 Network graphs saved in e.g. ONNX or protobuf format are not target-specific. In order to run inference, we must compile the graph into a target-specific program. 

-Two options may be turned on (default for both is `false`) when compiling:
- `bool offload_copy`: For targets with offloaded memory (such as the gpu), this will insert instructions during compilation to copy the input parameters to the offloaded memory and to copy the final result from the offloaded memory back to main memory.
- `bool fast_math`: Optimize math functions to use faster approximate versions. There may be slight accuracy degredation when enabled. 
+Two options may be turned on when compiling:
+- `set_offload_copy(bool value)`: For targets with offloaded memory (such as the gpu), this will insert instructions during compilation to copy the input parameters to the offloaded memory and to copy the final result from the offloaded memory back to main memory. Default value is `false` for offload_copy.
+- `set_fast_math(bool value)`: Optimize math functions to use faster approximate versions. There may be slight accuracy degradation when enabled. Default value is `true` for fast_math. 

 The following snippet assumes `targ` has been set as "gpu", and will compile the program with `offload_copy` enabled; `fast_math` is left at its default.
 ```
-migraphx_compile_options comp_opts;
-comp_opts.offload_copy = true;
+migraphx::compile_options comp_opts;
+comp_opts.set_offload_copy();
 prog.compile(targ, comp_opts);
 ``` 


--- a/examples/vision/cpp_mnist/mnist_inference.cpp
+++ b/examples/vision/cpp_mnist/mnist_inference.cpp
@@ -99,8 +99,8 @@ int main(int argc, char** argv)

    if(GPU)
    {
-        migraphx_compile_options comp_opts;
-        comp_opts.offload_copy = true;
+        migraphx::compile_options comp_opts;
+        comp_opts.set_offload_copy();
        prog.compile(targ, comp_opts);
    }
    else

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,6 +7,7 @@ include(CheckCXXLinkerFlag)
 add_library(migraphx 
    adjust_allocation.cpp
    analyze_streams.cpp
+    apply_alpha_beta.cpp
    argument.cpp
    auto_contiguous.cpp
    common.cpp
@@ -14,7 +15,6 @@ add_library(migraphx
    convert_to_json.cpp
    cpp_generator.cpp
    dead_code_elimination.cpp
-    decompose.cpp
    dom_info.cpp
    dynamic_loader.cpp
    eliminate_allocation.cpp
@@ -53,7 +53,6 @@ add_library(migraphx
    reduce_dims.cpp
    register_op.cpp
    register_target.cpp
-    remap.cpp
    simplify_qdq.cpp
    rewrite_batchnorm.cpp
    rewrite_pooling.cpp
@@ -130,7 +129,9 @@ register_migraphx_ops(
    min
    mul
    multibroadcast
+    multinomial
    neg
+    nonzero
    outline
    pad
    pooling

--- a/src/api/CMakeLists.txt
+++ b/src/api/CMakeLists.txt
@@ -3,7 +3,7 @@ add_library(migraphx_c
    api.cpp
 )
 set_target_properties(migraphx_c PROPERTIES EXPORT_NAME c)
-rocm_set_soversion(migraphx_c 2.0)
+rocm_set_soversion(migraphx_c 3.0)

 rocm_clang_tidy_check(migraphx_c)
 target_link_libraries(migraphx_c PRIVATE migraphx migraphx_tf migraphx_onnx migraphx_all_targets)

--- a/src/apply_alpha_beta.cpp
+++ b/src/apply_alpha_beta.cpp
+#include <migraphx/instruction.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/common.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+// Inserts `op(alpha * A, B)` in front of `pos`, where A = args[0] and B = args[1].
+// When exactly three arguments are supplied, `beta` is non-zero and the third
+// operand C = args[2] has at least one element, `beta * C` is added to the result
+// (C is multibroadcast to the result lens first when the lens differ).
+// A scaled operand is converted back to its original element type whenever the
+// multiply produced a different type.
+// Returns the last instruction inserted for the computation.
+instruction_ref insert_apply_alpha_beta(module& m,
+                                        instruction_ref pos,
+                                        const std::vector<instruction_ref>& args,
+                                        const operation& op,
+                                        const literal& alpha,
+                                        const literal& beta)
+{
+    auto lhs      = args[0];
+    auto rhs      = args[1];
+    auto lhs_type = lhs->get_shape().type();
+
+    // Multiplying by one changes nothing, so the scale is only emitted when needed.
+    if(not float_equal(alpha.at<float>(0), 1.0))
+    {
+        auto alpha_ins = m.add_literal(alpha);
+        lhs = insert_common_op(m, pos, migraphx::make_op("mul"), {alpha_ins, lhs});
+        if(lhs->get_shape().type() != lhs_type)
+        {
+            lhs = m.insert_instruction(
+                pos, migraphx::make_op("convert", {{"target_type", lhs_type}}), lhs);
+        }
+    }
+
+    auto product = m.insert_instruction(pos, op, lhs, rhs);
+
+    // Without a third operand there is nothing to accumulate.
+    if(args.size() != 3)
+        return product;
+    // A zero beta or an empty third operand contributes nothing to the sum.
+    if(float_equal(beta.at<float>(0), 0.0) or not(args[2]->get_shape().elements() > 0))
+        return product;
+
+    auto out_lens    = product->get_shape().lens();
+    auto addend      = args[2];
+    auto addend_type = addend->get_shape().type();
+    if(out_lens != addend->get_shape().lens())
+    {
+        addend = m.insert_instruction(
+            pos, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), addend);
+    }
+
+    auto beta_ins = m.add_literal(beta);
+    auto scaled   = insert_common_op(m, pos, migraphx::make_op("mul"), {addend, beta_ins});
+    if(scaled->get_shape().type() != addend_type)
+    {
+        scaled = m.insert_instruction(
+            pos, migraphx::make_op("convert", {{"target_type", addend_type}}), scaled);
+    }
+    return m.insert_instruction(pos, migraphx::make_op("add"), product, scaled);
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/common.cpp
+++ b/src/common.cpp
--- a/src/decompose.cpp
+++ b/src/decompose.cpp
-#include <migraphx/decompose.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/functional.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/float_equal.hpp>
-#include <migraphx/matcher.hpp>
-#include <migraphx/op/dot.hpp>
-#include <migraphx/make_op.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace {
-
-struct alpha_beta
-{
-    float alpha = 0.0;
-    float beta  = 0.0;
-};
-
-alpha_beta get_alpha_beta(const operation& op)
-{
-    auto v = op.to_value();
-    return {v.at("alpha").to<float>(), v.at("beta").to<float>()};
-}
-
-struct find_dot_add
-{
-    auto matcher() const { return match::name("dot", "quant_dot")(match::nargs(3)); }
-
-    void apply(module& p, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto dot   = get_alpha_beta(ins->get_operator());
-        auto a_ins = ins->inputs()[0];
-        auto b_ins = ins->inputs()[1];
-        if(not float_equal(dot.alpha, 1))
-        {
-            auto alpha = p.add_literal(literal{shape{a_ins->get_shape().type()}, {dot.alpha}});
-            auto alpha_broadcast = p.insert_instruction(
-                ins, make_op("multibroadcast", {{"out_lens", a_ins->get_shape().lens()}}), alpha);
-            a_ins = p.insert_instruction(ins, make_op("mul"), a_ins, alpha_broadcast);
-        }
-        auto dot_ins = p.insert_instruction(ins, make_op(ins->name(), {{"beta", 0}}), a_ins, b_ins);
-
-        auto c_ins = ins->inputs()[2];
-        if(not float_equal(dot.beta, 1))
-        {
-            auto beta = p.add_literal(literal{shape{c_ins->get_shape().type()}, {dot.beta}});
-            auto beta_broadcast = p.insert_instruction(
-                ins, make_op("multibroadcast", {{"out_lens", ins->get_shape().lens()}}), beta);
-            c_ins = p.insert_instruction(ins, make_op("mul"), c_ins, beta_broadcast);
-        }
-        p.replace_instruction(ins, make_op("add"), dot_ins, c_ins);
-    }
-};
-
-struct find_dot_alpha
-{
-    auto matcher() const { return match::name("dot", "quant_dot")(match::nargs(2)); }
-
-    void apply(module& p, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto dot   = get_alpha_beta(ins->get_operator());
-        auto a_ins = ins->inputs()[0];
-        auto b_ins = ins->inputs()[1];
-        if(not float_equal(dot.alpha, 1))
-        {
-            auto alpha = p.add_literal(literal{shape{a_ins->get_shape().type()}, {dot.alpha}});
-            auto alpha_broadcast = p.insert_instruction(
-                ins, make_op("multibroadcast", {{"out_lens", a_ins->get_shape().lens()}}), alpha);
-            a_ins = p.insert_instruction(ins, make_op("mul"), a_ins, alpha_broadcast);
-        }
-        p.replace_instruction(ins, make_op(ins->name(), {{"beta", 0}}), a_ins, b_ins);
-    }
-};
-
-} // namespace
-
-void decompose::apply(module& p) const { match::find_matches(p, find_dot_add{}, find_dot_alpha{}); }
-
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/driver/alexnet.cpp
+++ b/src/driver/alexnet.cpp
 #include <migraphx/operators.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/generate.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
 #include "models.hpp"

 namespace migraphx {
@@ -144,10 +145,10 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
    migraphx::op::multibroadcast multibroadcast42;
    multibroadcast42.output_lens = {batch, 4096};
    auto mx42                    = mm->add_instruction(multibroadcast42, mx4);
-    migraphx::op::dot dot43;
-    dot43.alpha = 1;
-    dot43.beta  = 1;
-    auto mx43   = mm->add_instruction(dot43, mx40, mx41, mx42);
+    float dot43_alpha            = 1;
+    float dot43_beta             = 1;
+    auto mx43                    = migraphx::add_apply_alpha_beta(
+        *mm, {mx40, mx41, mx42}, migraphx::make_op("dot"), dot43_alpha, dot43_beta);
    migraphx::op::relu relu44;
    auto mx44 = mm->add_instruction(relu44, mx43);
    migraphx::op::identity identity45;
@@ -158,10 +159,10 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
    migraphx::op::multibroadcast multibroadcast47;
    multibroadcast47.output_lens = {batch, 4096};
    auto mx47                    = mm->add_instruction(multibroadcast47, mx2);
-    migraphx::op::dot dot48;
-    dot48.alpha = 1;
-    dot48.beta  = 1;
-    auto mx48   = mm->add_instruction(dot48, mx45, mx46, mx47);
+    float dot48_alpha            = 1;
+    float dot48_beta             = 1;
+    auto mx48                    = migraphx::add_apply_alpha_beta(
+        *mm, {mx45, mx46, mx47}, migraphx::make_op("dot"), dot48_alpha, dot48_beta);
    migraphx::op::relu relu49;
    auto mx49 = mm->add_instruction(relu49, mx48);
    migraphx::op::transpose transpose50;
@@ -170,10 +171,10 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
    migraphx::op::multibroadcast multibroadcast51;
    multibroadcast51.output_lens = {batch, 1000};
    auto mx51                    = mm->add_instruction(multibroadcast51, mx0);
-    migraphx::op::dot dot52;
-    dot52.alpha = 1;
-    dot52.beta  = 1;
-    mm->add_instruction(dot52, mx49, mx50, mx51);
+    float dot52_alpha            = 1;
+    float dot52_beta             = 1;
+    migraphx::add_apply_alpha_beta(
+        *mm, {mx49, mx50, mx51}, migraphx::make_op("dot"), dot52_alpha, dot52_beta);
    return p;
 }


--- a/src/driver/inceptionv3.cpp
+++ b/src/driver/inceptionv3.cpp
 #include <migraphx/operators.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/generate.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
 #include "models.hpp"

 namespace migraphx {
@@ -2225,10 +2226,10 @@ migraphx::program inceptionv3(unsigned batch) // NOLINT(readability-function-siz
    migraphx::op::multibroadcast multibroadcast798;
    multibroadcast798.output_lens = {batch, 1000};
    auto mx798                    = mm->add_instruction(multibroadcast798, mx0);
-    migraphx::op::dot dot799;
-    dot799.alpha = 1;
-    dot799.beta  = 1;
-    mm->add_instruction(dot799, mx796, mx797, mx798);
+    float dot799_alpha            = 1;
+    float dot799_beta             = 1;
+    migraphx::add_apply_alpha_beta(
+        *mm, {mx796, mx797, mx798}, migraphx::make_op("dot"), dot799_alpha, dot799_beta);

    return p;
 }

--- a/src/driver/resnet50.cpp
+++ b/src/driver/resnet50.cpp
 #include <migraphx/operators.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/generate.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
 #include "models.hpp"

 namespace migraphx {
@@ -1228,10 +1229,10 @@ migraphx::program resnet50(unsigned batch) // NOLINT(readability-function-size)
    migraphx::op::multibroadcast multibroadcast442;
    multibroadcast442.output_lens = {batch, 1000};
    auto mx442                    = mm->add_instruction(multibroadcast442, mx0);
-    migraphx::op::dot dot443;
-    dot443.alpha = 1;
-    dot443.beta  = 1;
-    mm->add_instruction(dot443, mx440, mx441, mx442);
+    float dot443_alpha            = 1;
+    float dot443_beta             = 1;
+    migraphx::add_apply_alpha_beta(
+        *mm, {mx440, mx441, mx442}, migraphx::make_op("dot"), dot443_alpha, dot443_beta);
    return p;
 }


--- a/src/include/migraphx/apply_alpha_beta.hpp
+++ b/src/include/migraphx/apply_alpha_beta.hpp
+#ifndef MIGRAPHX_GUARD_MIGRAPHX_APPLY_ALPHA_BETA_HPP
+#define MIGRAPHX_GUARD_MIGRAPHX_APPLY_ALPHA_BETA_HPP
+
+#include "migraphx/make_op.hpp"
+#include "migraphx/normalize_attributes.hpp"
+#include "migraphx/operation.hpp"
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/module.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+/// Inserts `op` applied to `alpha * args[0]` and `args[1]` before `pos`; when a
+/// third argument is present, `beta` is non-zero and that argument is non-empty,
+/// `beta * args[2]` is added to the result. Returns the final instruction.
+instruction_ref insert_apply_alpha_beta(module& m,
+                                        instruction_ref pos,
+                                        const std::vector<instruction_ref>& args,
+                                        const operation& op,
+                                        const literal& alpha,
+                                        const literal& beta);
+
+/// Scalar convenience overload: wraps `alpha` and `beta` in literals of type `T`
+/// and forwards to the literal-based overload. Defaults are alpha = 1, beta = 0.
+template <typename T = float>
+instruction_ref insert_apply_alpha_beta(module& m,
+                                        instruction_ref pos,
+                                        const std::vector<instruction_ref>& args,
+                                        const operation& op,
+                                        T alpha = 1,
+                                        T beta  = 0)
+{
+    const literal alpha_literal{T{alpha}};
+    const literal beta_literal{T{beta}};
+    return insert_apply_alpha_beta(m, pos, args, op, alpha_literal, beta_literal);
+}
+
+/// Same as the scalar `insert_apply_alpha_beta`, but inserts at the end of `m`.
+template <typename T = float>
+instruction_ref add_apply_alpha_beta(module& m,
+                                     const std::vector<instruction_ref>& args,
+                                     const operation& op,
+                                     T alpha = 1,
+                                     T beta  = 0)
+{
+    auto append_pos = m.end();
+    return insert_apply_alpha_beta(m, append_pos, args, op, alpha, beta);
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_MIGRAPHX_APPLY_ALPHA_BETA_HPP
--- a/src/include/migraphx/op/dot.hpp
+++ b/src/include/migraphx/op/dot.hpp
@@ -18,19 +18,10 @@ namespace op {

 struct dot
 {
-    float alpha = 1.0;
-    float beta  = 1.0;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
-    }
-
    std::string name() const { return "dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.same_type();
+        check_shapes{inputs, *this}.same_type().has(2);
        const shape& a = inputs.at(0);
        const shape& b = inputs.at(1);
        auto t         = a.type();
@@ -58,25 +49,14 @@ struct dot

        auto out_lens   = a.lens();
        out_lens[dim_1] = b.lens()[dim_1];
-        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-        {
-            MIGRAPHX_THROW("DOT: dimension mismatch, operand C: {" +
-                           to_string_range(inputs.at(2).lens()) +
-                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-        }
-
        return {t, out_lens};
    }

    argument compute(shape output_shape, std::vector<argument> args) const
    {
-        argument result;
-        if(args.size() == 3)
-            result = args[2];
-        else
-            result = argument{output_shape};
+        argument result = argument{output_shape};
        visit_all(result, args[0], args[1])(
-            [&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, alpha, beta); });
+            [&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, 1.0f, 0.0f); });
        return result;
    }
 };

--- a/src/include/migraphx/op/multinomial.hpp
+++ b/src/include/migraphx/op/multinomial.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
+#define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
+
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/par_for.hpp>
+#include <random>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+// Picks, for every random value in the second input, an index into the matching
+// row of the first input (treated as a cumulative distribution over classes).
+struct multinomial
+{
+    // Element type of the output; compute_shape rejects anything other than
+    // int32_type and int64_type.
+    shape::type_t dtype = shape::type_t::int32_type;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.dtype, "dtype"));
+    }
+
+    std::string name() const { return "multinomial"; }
+
+    // Requires exactly two 2-D inputs. The output has the first dimension of the
+    // first input and the last dimension of the second input.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(2).only_dims(2);
+        const auto& cdf_shape  = inputs.front();
+        const auto& dist_shape = inputs.back();
+        size_t sample_size     = dist_shape.lens().back();
+
+        if(not contains({shape::int32_type, shape::int64_type}, dtype))
+            MIGRAPHX_THROW(
+                "Multinomial: Invalid output type. Valid types are int32_type and int64_type.");
+
+        return {dtype, {cdf_shape.lens().front(), sample_size}};
+    }
+
+    // For each output element, scales the random value by the last entry of its
+    // cdf row and stores the offset of the first row entry greater than it.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        const auto& out_lens = output_shape.lens();
+        size_t batch_size    = out_lens.front();
+        size_t sample_size   = out_lens.back();
+        size_t class_size    = args[0].get_shape().lens().back();
+        size_t total         = batch_size * sample_size;
+
+        visit_all(args[0], args[1])([&](auto cdf, auto dist) {
+            result.visit([&](auto output) {
+                par_for(total, [&](auto i) {
+                    // idx[0] is the batch row the i-th sample belongs to.
+                    auto idx       = args[1].get_shape().multi(i);
+                    auto row_begin = cdf.begin() + (idx[0] * class_size);
+                    auto row_end   = row_begin + class_size;
+                    // Scale by the row's final cdf entry.
+                    auto target = dist[i] * *(std::prev(row_end));
+                    // If no entry is greater than target, the stored index equals
+                    // class_size.
+                    auto pick = std::upper_bound(row_begin, row_end, target);
+                    output[i] = std::distance(row_begin, pick);
+                });
+            });
+        });
+
+        return result;
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif