gaoqiong / MIGraphX · Commit dd033c75

Merge branch 'develop' into mlir-c

Authored Oct 18, 2021 by Paul
Parents: 50f87a87, 8829d6ab
Changes: 109 files in total; this page shows 20 changed files with 962 additions and 290 deletions (+962, -290).
Files changed on this page:
  src/targets/gpu/device/multinomial.cpp                        +66   -0
  src/targets/gpu/device/nonzero.cpp                            +53   -0
  src/targets/gpu/fuse_ops.cpp                                  +2    -2
  src/targets/gpu/include/migraphx/gpu/device/multinomial.hpp   +23   -0
  src/targets/gpu/include/migraphx/gpu/device/nonzero.hpp       +20   -0
  src/targets/gpu/include/migraphx/gpu/gemm.hpp                 +36   -2
  src/targets/gpu/include/migraphx/gpu/multinomial.hpp          +36   -0
  src/targets/gpu/include/migraphx/gpu/nonzero.hpp              +39   -0
  src/targets/gpu/lowering.cpp                                  +10   -12
  src/targets/gpu/multinomial.cpp                               +27   -0
  src/targets/gpu/nonzero.cpp                                   +21   -0
  src/targets/gpu/target.cpp                                    +0    -3
  src/targets/ref/lowering.cpp                                  +4    -65
  test/decompose_test.cpp                                       +0    -149
  test/dot_apply_alpha_beta_test.cpp                            +134  -0
  test/gpu/pack_int8_args.cpp                                   +146  -57
  test/onnx/depthtospace_crd_test.onnx                          +19   -0
  test/onnx/depthtospace_simple_test.onnx                       +17   -0
  test/onnx/depthtospace_test.onnx                              +19   -0
  test/onnx/gen_onnx.py                                         +290  -0
src/targets/gpu/device/multinomial.cpp (new file, 0 → 100644)

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
    Iterator it;
    typename std::iterator_traits<Iterator>::difference_type count;
    typename std::iterator_traits<Iterator>::difference_type step;
    count = std::distance(first, last);
    while(count > 0)
    {
        it   = first;
        step = count / 2;
        std::advance(it, step);
        if(!(value < *it))
        {
            first = ++it;
            count -= step + 1;
        }
        else
            count = step;
    }
    return first;
}

void multinomial(hipStream_t stream,
                 const argument& result,
                 const argument& arg0,
                 const argument& arg1)
{
    size_t batch_size  = arg0.get_shape().lens().front();
    size_t class_size  = arg0.get_shape().lens().back();
    size_t sample_size = result.get_shape().lens().back();

    hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
        result.visit([&](auto out) {
            hip_visit_views(out)([&](auto output) {
                gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
                    auto idx       = output.get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx.front() * class_size);
                    auto cdf_end   = cdf_begin + class_size;
                    auto sample_iter =
                        upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
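Note: the kernel above samples one class per output element by rescaling a uniform random value against the row's cumulative distribution and binary-searching with upper_bound. A minimal host-side sketch of the same rule, assuming a row-major CDF; the names here are illustrative, not MIGraphX API:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

std::size_t sample_class(const std::vector<float>& cdf, float u)
{
    // u * cdf.back() rescales the uniform sample to the (possibly unnormalized)
    // CDF range, matching `dist[i] * *(std::prev(cdf_end))` in the kernel.
    auto it = std::upper_bound(cdf.begin(), cdf.end(), u * cdf.back());
    return std::distance(cdf.begin(), it);
}

int main()
{
    std::vector<float> cdf = {0.1f, 0.4f, 1.0f}; // class weights 0.1, 0.3, 0.6
    std::cout << sample_class(cdf, 0.05f) << '\n'; // prints 0
    std::cout << sample_class(cdf, 0.25f) << '\n'; // prints 1
    std::cout << sample_class(cdf, 0.95f) << '\n'; // prints 2
}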
src/targets/gpu/device/nonzero.cpp (new file, 0 → 100644)

#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
    auto s            = arg_data.get_shape();
    auto elem_num     = s.elements();
    auto out_elem_num = result.get_shape().elements();

    // call the prefix_sum function to do a prefix_sum to compute
    // index in the output. Only 1 block can be used since we have
    // only one prefix sum
    const index_int block_size = 256;
    hip_visit_all(arg_data, s)([&](auto input, auto si) {
        const auto* in_ptr = device_cast(input.data());
        auto* ptr          = result.cast<int64_t>();
        gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
            // fill all output to 0 first
            idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });
            block_scan<block_size>(idx,
                                   sum{},
                                   0,
                                   elem_num,
                                   [&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; },
                                   [&](auto j, auto x) {
                                       auto out_loc = x - 1;
                                       if(float_equal(in_ptr[j], 0))
                                           return;
                                       auto index = si.multi(j);
                                       for(size_t k = 0; k < index.size(); ++k)
                                       {
                                           ptr[k * elem_num + out_loc] = index[k];
                                       }
                                   });
        });
    });

    return result;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
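Note: the write `ptr[k * elem_num + out_loc] = index[k]` lays the result out as a [rank, elem_num] matrix in which column n holds the multi-index of the n-th nonzero element, with unused columns left zero-filled, consistent with the ONNX NonZero convention. A host-side sketch of that layout, with illustrative names (the kernel uses float_equal rather than a raw comparison):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // 2x3 input, row-major; nonzeros at (0,0), (0,2), (1,1)
    std::vector<float> in = {1.0f, 0.0f, 2.0f, 0.0f, 3.0f, 0.0f};
    const std::size_t rows = 2, cols = 3, elem_num = rows * cols;

    std::vector<int64_t> out(2 * elem_num, 0); // [rank = 2, elem_num]
    std::size_t out_loc = 0;
    for(std::size_t j = 0; j < elem_num; ++j)
    {
        if(in[j] == 0.0f)
            continue;
        // same indexing as ptr[k * elem_num + out_loc] = index[k] above
        out[0 * elem_num + out_loc] = j / cols; // row index
        out[1 * elem_num + out_loc] = j % cols; // column index
        ++out_loc;
    }
    for(std::size_t k = 0; k < 2; ++k)
    {
        // prints "0 0 1 0 0 0" then "0 2 1 0 0 0"
        for(std::size_t j = 0; j < elem_num; ++j)
            std::cout << out[k * elem_num + j] << ' ';
        std::cout << '\n';
    }
}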
src/targets/gpu/fuse_ops.cpp

@@ -717,7 +717,7 @@ struct find_gemm_add
         auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());

         // Already fused gemm
-        if(not float_equal(gemm.op.beta, 0))
+        if(not float_equal(gemm.beta, 0))
             return;

         if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
@@ -738,7 +738,7 @@ struct find_gemm_add
         inputs.push_back(copy_ins);

-        gemm.op.beta = 1;
+        gemm.beta = 1;
         p.replace_instruction(ins, gemm, inputs);
     }
 };
src/targets/gpu/include/migraphx/gpu/device/multinomial.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void multinomial(hipStream_t stream,
                 const argument& result,
                 const argument& arg0,
                 const argument& arg1);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/include/migraphx/decompose.hpp → src/targets/gpu/include/migraphx/gpu/device/nonzero.hpp (renamed with changes)

-#ifndef MIGRAPHX_GUARD_RTGLIB_DECOMPOSE_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DECOMPOSE_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP

-#include <string>
-#include <migraphx/instruction_ref.hpp>
+#include <migraphx/argument.hpp>
 #include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {

-struct module;
-
-/**
- * Decompose operators.
- */
-struct decompose
-{
-    std::string name() const { return "decompose"; }
-    void apply(module& p) const;
-};
+argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data);

+} // namespace device
+} // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/include/migraphx/gpu/gemm.hpp

 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP

 #include <migraphx/errors.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>

@@ -19,13 +22,17 @@ template <class Op>
 struct rocblas_gemm
 {
     Op op;
+    float alpha         = 1;
+    float beta          = 0;
     bool int8_x4_format = true;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
         return pack_join(migraphx::reflect(self.op, f),
-                         pack(f(self.int8_x4_format, "int8_x4_format")));
+                         pack(f(self.alpha, "alpha"),
+                              f(self.beta, "beta"),
+                              f(self.int8_x4_format, "int8_x4_format")));
     }

     std::string name() const
@@ -44,6 +51,26 @@ struct rocblas_gemm
         check_shapes{in_shapes, *this}.not_broadcasted();
         batch_not_transposed(inputs[0].strides());
         batch_not_transposed(inputs[1].strides());
+
+        // if gemm and add are fused
+        if(not float_equal(beta, 0))
+        {
+            auto cmat_shape = in_shapes.back();
+            in_shapes.pop_back();
+            auto op_out_shape = op.compute_shape(in_shapes);
+            if(cmat_shape.lens() != op_out_shape.lens())
+            {
+                MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" +
+                               to_string_range(cmat_shape.lens()) +
+                               "}, cannot add to operand A * B: {" +
+                               to_string_range(op_out_shape.lens()) + "}");
+            }
+            if(cmat_shape.type() != op_out_shape.type())
+            {
+                MIGRAPHX_THROW(this->name() +
+                               " : operand C type mismatch, operand C is of type: " +
+                               to_string(cmat_shape.type()) + ", it must be: " +
+                               to_string(op_out_shape.type()));
+            }
+        }
         return op.compute_shape(in_shapes);
     }
@@ -51,7 +78,14 @@ struct rocblas_gemm
     argument compute(context& ctx,
                      const shape& output_shape,
                      const std::vector<argument>& args) const
     {
-        gemm(ctx, output_shape, args, op.alpha, op.beta, int8_x4_format);
+        if(this->name() == "gpu::gemm")
+        {
+            gemm(ctx, output_shape, args, alpha, beta, int8_x4_format);
+        }
+        else
+        {
+            gemm(ctx, output_shape, args, int32_t(alpha), int32_t(beta), int8_x4_format);
+        }
         return args.back();
     }
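Note: with alpha/beta now members of rocblas_gemm, compute_shape validates the C operand whenever beta is nonzero (a fused gemm + add): the last input must match the A*B result in both dimensions and element type. A standalone sketch of that check; toy_shape and the names below are illustrative stand-ins, since MIGraphX's shape class is richer:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

struct toy_shape
{
    std::string type;
    std::vector<std::size_t> lens;
};

// {m, k} x {k, n} -> {m, n}
toy_shape dot_output(const toy_shape& a, const toy_shape& b)
{
    return {a.type, {a.lens[0], b.lens[1]}};
}

toy_shape fused_gemm_shape(std::vector<toy_shape> in_shapes, float beta)
{
    if(beta != 0.0f)
    {
        auto cmat = in_shapes.back(); // C operand comes last
        in_shapes.pop_back();
        auto ab = dot_output(in_shapes[0], in_shapes[1]);
        if(cmat.lens != ab.lens)
            throw std::runtime_error("dimension mismatch: C cannot be added to A * B");
        if(cmat.type != ab.type)
            throw std::runtime_error("type mismatch: C must match A * B");
        return ab;
    }
    return dot_output(in_shapes[0], in_shapes[1]);
}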
src/targets/gpu/include/migraphx/gpu/multinomial.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP

#include <migraphx/op/multinomial.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_multinomial
{
    op::multinomial op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::multinomial"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument compute(context& ctx,
                     const shape& output_shape,
                     const std::vector<argument>& args) const;
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/include/migraphx/gpu/nonzero.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP
#define MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP

#include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/op/nonzero.hpp>
#include <migraphx/gpu/miopen.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_nonzero
{
    op::nonzero op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::nonzero"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument compute(context& ctx,
                     const shape& output_shape,
                     const std::vector<argument>& args) const;
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
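Note: both new GPU ops report output_alias() == shapes.size() - 1, the convention this commit relies on: the lowered op receives its preallocated output buffer as the final argument, writes into it, and returns it, so the runtime schedules no extra copy. A minimal sketch of that convention with illustrative names:

#include <cstddef>
#include <vector>

struct buffer
{
    // stand-in for a device memory handle
};

buffer run_kernel_into(buffer out)
{
    // a kernel launch would write through `out` here
    return out;
}

buffer compute(const std::vector<buffer>& args)
{
    // args = {input0, input1, ..., preallocated output}
    return run_kernel_into(args.back());
}

std::ptrdiff_t output_alias(std::size_t num_args)
{
    // the result aliases the last argument
    return static_cast<std::ptrdiff_t>(num_args) - 1;
}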
src/targets/gpu/lowering.cpp

@@ -164,6 +164,8 @@ struct miopen_apply
         add_extend_op("leaky_relu");
         add_extend_op("logsoftmax");
         add_extend_op("lrn");
+        add_extend_op("multinomial");
+        add_extend_op("nonzero");
         add_extend_op("pad");
         add_extend_op("pooling");
         add_extend_op("prefix_scan_sum");
@@ -180,15 +182,15 @@ struct miopen_apply
         add_extend_op("softmax");
         add_extend_op("topk");

-        add_gemm_op<op::dot>("dot");
-        add_gemm_op<op::quant_dot>("quant_dot");
-        add_batch_norm_inference_op();
         add_convolution_op();
         add_deconvolution_op();
-        add_quant_convolution_op();
+        add_batch_norm_inference_op();
-        add_neg_op();
+        add_gemm_op<op::dot>("dot");
+        add_gemm_op<op::quant_dot>("quant_dot");
         add_if_op();
         add_loop_op();
+        add_neg_op();
+        add_quant_convolution_op();
     }

     void copy_params()
@@ -303,17 +305,14 @@ struct miopen_apply
         });
     }

-    template <class Op>
-    void add_gemm_op(std::string name)
+    template <typename Op>
+    void add_gemm_op(const std::string& name)
     {
         apply_map.emplace(name, [=](instruction_ref ins) {
-            auto&& op = any_cast<Op>(ins->get_operator());
-            auto beta = op.beta;
             std::vector<instruction_ref> refs = ins->inputs();
             if(refs.size() == 2)
             {
                 auto output = insert_allocation(ins, ins->get_shape());
-                beta        = 0;
                 refs.push_back(output);
             }
             else
@@ -332,9 +331,8 @@ struct miopen_apply
                 refs.push_back(refs.back());
             }

             return mod->replace_instruction(
-                ins, rocblas_gemm<Op>{Op{op.alpha, beta}, int8_x4_format}, refs);
+                ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format}, refs);
         });
     }
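Note: add_gemm_op now always lowers to rocblas_gemm<Op>{Op{}, 1, 0, ...}; the alpha/beta scaling is expected to have been decomposed into explicit mul/add instructions earlier (see the apply_alpha_beta tests later in this commit). A scalar sketch of that decomposition, with illustrative names:

#include <iostream>

// what the fused op used to compute in one call
float gemm_fused(float a, float b, float c, float alpha, float beta)
{
    return alpha * a * b + beta * c;
}

// what the graph computes after decomposition: a plain gemm (alpha=1, beta=0)
// preceded by a multiply on A and followed by mul/add on C
float gemm_decomposed(float a, float b, float c, float alpha, float beta)
{
    float scaled_a = alpha * a;    // "mul" instruction
    float ab       = scaled_a * b; // plain dot
    return ab + beta * c;          // "mul" + "add" instructions
}

int main()
{
    std::cout << gemm_fused(2, 3, 4, 3.0f, 2.0f) << ' '
              << gemm_decomposed(2, 3, 4, 3.0f, 2.0f) << '\n'; // both print 26
}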
src/targets/gpu/multinomial.cpp (new file, 0 → 100644)

#include <migraphx/gpu/multinomial.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/check_shapes.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_multinomial::compute_shape(std::vector<shape> inputs) const
{
    check_shapes{inputs, *this}.has(3).only_dims(2).standard();
    inputs.pop_back();
    return op.compute_shape(inputs);
}

argument hip_multinomial::compute(context& ctx,
                                  const shape&,
                                  const std::vector<argument>& args) const
{
    device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]);
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/nonzero.cpp (new file, 0 → 100644)

#include <migraphx/gpu/nonzero.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/nonzero.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_nonzero::compute_shape(std::vector<shape> inputs) const
{
    return op.compute_shape({inputs.front()});
}

argument hip_nonzero::compute(context& ctx,
                              const shape&,
                              const std::vector<argument>& args) const
{
    return device::nonzero(ctx.get_stream().get(), args.back(), args.front());
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/target.cpp

@@ -2,7 +2,6 @@
 #include <migraphx/auto_contiguous.hpp>
 #include <migraphx/check_context.hpp>
 #include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/decompose.hpp>
 #include <migraphx/eliminate_allocation.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
 #include <migraphx/eliminate_concat.hpp>
@@ -17,7 +16,6 @@
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
-#include <migraphx/remap.hpp>
 #include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
@@ -59,7 +57,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
     return
     {
         normalize_ops{},
-        decompose{},
         dead_code_elimination{},
         simplify_qdq{},
         rewrite_quantization{},
src/targets/ref/lowering.cpp

@@ -518,42 +518,12 @@ struct ref_gemm
         return migraphx::reflect(self.op, f);
     }

     std::string name() const { return "ref::dot"; }

-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        if(inputs.size() == 3)
-        {
-            auto c_shape = inputs.at(2);
-            check_shapes{{c_shape}, *this}.not_broadcasted();
-        }
-        return op.compute_shape(inputs);
-    }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        return op.compute_shape(inputs);
+    }

     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrices, and C is of the same shape as A * B
-        if(args.size() == 3)
-        {
-            // no need to consider the value of args[2]
-            if(op.beta == 0.0f)
-            {
-                result.visit(
-                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
-            }
-            else
-            {
-                visit_all(result, args[2])([&](auto output, auto input) {
-                    std::copy(input.begin(), input.end(), output.begin());
-                });
-            }
-            migemm(result, args[0], args[1], op.alpha, op.beta);
-            return result;
-        }
-        // 2 input arguments
-        migemm(result, args[0], args[1], op.alpha, 0.0f);
+        migemm(result, args[0], args[1], 1.0f, 0.0f);
         return result;
     }
@@ -571,22 +541,11 @@ struct ref_quant_gemm
     }

     std::string name() const { return "ref::quant_dot"; }

-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        if(inputs.size() == 3)
-        {
-            auto c_shape = inputs.at(2);
-            check_shapes{{c_shape}, *this}.not_broadcasted();
-        }
-        return op.compute_shape(inputs);
-    }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        return op.compute_shape(inputs);
+    }

     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrices, and C is of the same shape to A * B
         // first, convert the args[0] and args[1] from int8_t to int32_t
         argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
         argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
@@ -600,27 +559,7 @@ struct ref_quant_gemm
             [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
         });

-        if(args.size() == 3)
-        {
-            // no need to consider the value of args[2]
-            if(op.beta == 0)
-            {
-                result.visit(
-                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
-            }
-            else
-            {
-                visit_all(result, args[2])([&](auto output, auto input) {
-                    std::copy(input.begin(), input.end(), output.begin());
-                });
-            }
-            migemm(result, arg_0, arg_1, op.alpha, op.beta);
-            return result;
-        }
-        // 2 input arguments
-        migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
+        migemm(result, arg_0, arg_1, int32_t{1}, int32_t{0});
         return result;
     }
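Note: after this change the reference gemm always runs with alpha = 1, beta = 0. migemm itself is MIGraphX-internal; a naive sketch of what a reference gemm with (alpha, beta) arguments computes, C = alpha * A*B + beta * C, assuming row-major dense matrices and illustrative names:

#include <cstddef>
#include <vector>

void naive_gemm(std::vector<float>& c,
                const std::vector<float>& a,
                const std::vector<float>& b,
                std::size_t m, std::size_t k, std::size_t n,
                float alpha, float beta)
{
    for(std::size_t i = 0; i < m; ++i)
        for(std::size_t j = 0; j < n; ++j)
        {
            float acc = 0.0f;
            for(std::size_t l = 0; l < k; ++l)
                acc += a[i * k + l] * b[l * n + j];
            // with alpha = 1 and beta = 0 this reduces to c = A*B
            c[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
}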
test/decompose_test.cpp (deleted, 100644 → 0; last changed in 50f87a87)

#include <migraphx/decompose.hpp>
#include <migraphx/pass_manager.hpp>
#include <basic_ops.hpp>
#include <migraphx/make_op.hpp>
#include <test.hpp>

void run_pass(migraphx::module& m)
{
    migraphx::run_passes(m, {migraphx::decompose{}});
}

TEST_CASE(dot_add)
{
    migraphx::module m1;
    {
        auto x   = m1.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto y   = m1.add_parameter("y", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto z   = m1.add_parameter("z", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto dot = m1.add_instruction(migraphx::make_op("dot"), x, y, z);
        m1.add_instruction(migraphx::make_op("identity"), dot);
    }
    run_pass(m1);

    migraphx::module m2;
    {
        auto x   = m2.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto dot = m2.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1}, {"beta", 0}}), x, y);
        auto add = m2.add_instruction(migraphx::make_op("add"), dot, z);
        m2.add_instruction(migraphx::make_op("identity"), add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(dot_add_beta_float)
{
    migraphx::module m1;
    {
        auto x   = m1.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto y   = m1.add_parameter("y", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto z   = m1.add_parameter("z", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto dot = m1.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1.0}, {"beta", 0.5}}), x, y, z);
        m1.add_instruction(migraphx::make_op("identity"), dot);
    }
    run_pass(m1);

    migraphx::module m2;
    {
        auto x   = m2.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{migraphx::shape::float_type, {2, 2}});
        auto dot = m2.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1}, {"beta", 0}}), x, y);
        auto beta = m2.add_literal(
            migraphx::literal{migraphx::shape{migraphx::shape::float_type}, {0.5}});
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", {2, 2}}}), beta);
        auto mul = m2.add_instruction(migraphx::make_op("mul"), z, beta_broadcast);
        auto add = m2.add_instruction(migraphx::make_op("add"), dot, mul);
        m2.add_instruction(migraphx::make_op("identity"), add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(dot_add_beta_half)
{
    migraphx::module m1;
    {
        auto x   = m1.add_parameter("x", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto y   = m1.add_parameter("y", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto z   = m1.add_parameter("z", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto dot = m1.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1.0}, {"beta", 0.5}}), x, y, z);
        m1.add_instruction(migraphx::make_op("identity"), dot);
    }
    run_pass(m1);

    migraphx::module m2;
    {
        auto x   = m2.add_parameter("x", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto dot = m2.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1}, {"beta", 0}}), x, y);
        auto beta = m2.add_literal(
            migraphx::literal{migraphx::shape{migraphx::shape::half_type}, {0.5}});
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", {2, 2}}}), beta);
        auto mul = m2.add_instruction(migraphx::make_op("mul"), z, beta_broadcast);
        auto add = m2.add_instruction(migraphx::make_op("add"), dot, mul);
        m2.add_instruction(migraphx::make_op("identity"), add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(dot_add_beta_double)
{
    migraphx::module m1;
    {
        auto x   = m1.add_parameter("x", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto y   = m1.add_parameter("y", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto z   = m1.add_parameter("z", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto dot = m1.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1.0}, {"beta", 0.5}}), x, y, z);
        m1.add_instruction(migraphx::make_op("identity"), dot);
    }
    run_pass(m1);

    migraphx::module m2;
    {
        auto x   = m2.add_parameter("x", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto dot = m2.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1}, {"beta", 0}}), x, y);
        auto beta = m2.add_literal(
            migraphx::literal{migraphx::shape{migraphx::shape::double_type}, {0.5}});
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", {2, 2}}}), beta);
        auto mul = m2.add_instruction(migraphx::make_op("mul"), z, beta_broadcast);
        auto add = m2.add_instruction(migraphx::make_op("add"), dot, mul);
        m2.add_instruction(migraphx::make_op("identity"), add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(dot_add_beta_int)
{
    migraphx::module m1;
    {
        auto x   = m1.add_parameter("x", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto y   = m1.add_parameter("y", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto z   = m1.add_parameter("z", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto dot = m1.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1.0}, {"beta", 0.5}}), x, y, z);
        m1.add_instruction(migraphx::make_op("identity"), dot);
    }
    run_pass(m1);

    migraphx::module m2;
    {
        auto x   = m2.add_parameter("x", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto dot = m2.add_instruction(
            migraphx::make_op("dot", {{"alpha", 1}, {"beta", 0}}), x, y);
        auto beta = m2.add_literal(
            migraphx::literal{migraphx::shape{migraphx::shape::int32_type}, {0.5}});
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", {2, 2}}}), beta);
        auto mul = m2.add_instruction(migraphx::make_op("mul"), z, beta_broadcast);
        auto add = m2.add_instruction(migraphx::make_op("add"), dot, mul);
        m2.add_instruction(migraphx::make_op("identity"), add);
    }
    EXPECT(m1 == m2);
}

int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/dot_apply_alpha_beta_test.cpp (new file, 0 → 100644)

#include <cstdint>
#include <migraphx/instruction.hpp>
#include <migraphx/apply_alpha_beta.hpp>
#include <basic_ops.hpp>
#include <migraphx/make_op.hpp>
#include <test.hpp>

TEST_CASE(dot_apply_alpha_beta_half)
{
    migraphx::module m1;
    {
        auto x = m1.add_parameter("x", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto y = m1.add_parameter("y", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto z = m1.add_parameter("z", migraphx::shape{migraphx::shape::half_type, {2, 2}});
        auto dot_res = migraphx::insert_apply_alpha_beta(
            m1, m1.end(), {x, y, z}, migraphx::make_op("dot"), 3.0f, 2.0f);
        m1.add_instruction(migraphx::make_op("identity"), dot_res);
    }

    migraphx::module m2;
    {
        auto ht = migraphx::shape::half_type;
        auto ft = migraphx::shape::float_type;
        auto x  = m2.add_parameter("x", migraphx::shape{ht, {2, 2}});
        auto y  = m2.add_parameter("y", migraphx::shape{ht, {2, 2}});
        auto z  = m2.add_parameter("z", migraphx::shape{ht, {2, 2}});
        auto alpha_literal   = m2.add_literal(3.0f);
        auto alpha_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", x->get_shape().lens()}}),
            alpha_literal);
        auto x_float =
            m2.add_instruction(migraphx::make_op("convert", {{"target_type", ft}}), x);
        auto x_alpha_float =
            m2.add_instruction(migraphx::make_op("mul"), alpha_broadcast, x_float);
        auto x_half = m2.add_instruction(
            migraphx::make_op("convert", {{"target_type", ht}}), x_alpha_float);
        auto dot_res      = m2.add_instruction(migraphx::make_op("dot"), x_half, y);
        auto beta_literal = m2.add_literal(2.0f);
        auto z_float =
            m2.add_instruction(migraphx::make_op("convert", {{"target_type", ft}}), z);
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", z->get_shape().lens()}}),
            beta_literal);
        auto z_beta_float =
            m2.add_instruction(migraphx::make_op("mul"), z_float, beta_broadcast);
        auto z_beta_half = m2.add_instruction(
            migraphx::make_op("convert", {{"target_type", ht}}), z_beta_float);
        auto z_add = m2.add_instruction(migraphx::make_op("add"), dot_res, z_beta_half);
        m2.add_instruction(migraphx::make_op("identity"), z_add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(dot_apply_alpha_beta_double)
{
    migraphx::module m1;
    {
        auto x = m1.add_parameter("x", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto y = m1.add_parameter("y", migraphx::shape{migraphx::shape::double_type, {2, 2}});
        auto z = m1.add_parameter("z", migraphx::shape{migraphx::shape::double_type, {2, 1}});
        auto dot_res = migraphx::add_apply_alpha_beta(
            m1, {x, y, z}, migraphx::make_op("dot"), 3.0f, 2.0f);
        m1.add_instruction(migraphx::make_op("identity"), dot_res);
    }

    migraphx::module m2;
    {
        auto dt = migraphx::shape::double_type;
        auto x  = m2.add_parameter("x", migraphx::shape{dt, {2, 2}});
        auto y  = m2.add_parameter("y", migraphx::shape{dt, {2, 2}});
        auto z  = m2.add_parameter("z", migraphx::shape{dt, {2, 1}});
        auto alpha_literal   = m2.add_literal(3.0f);
        auto alpha_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", x->get_shape().lens()}}),
            alpha_literal);
        auto alpha_double = m2.add_instruction(
            migraphx::make_op("convert", {{"target_type", dt}}), alpha_broadcast);
        auto x_alpha_double =
            m2.add_instruction(migraphx::make_op("mul"), alpha_double, x);
        auto dot_res = m2.add_instruction(migraphx::make_op("dot"), x_alpha_double, y);
        auto z_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", {2, 2}}}), z);
        auto beta_literal   = m2.add_literal(2.0f);
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast",
                              {{"out_lens", z_broadcast->get_shape().lens()}}),
            beta_literal);
        auto beta_double = m2.add_instruction(
            migraphx::make_op("convert", {{"target_type", dt}}), beta_broadcast);
        auto z_beta_double =
            m2.add_instruction(migraphx::make_op("mul"), z_broadcast, beta_double);
        auto z_add = m2.add_instruction(migraphx::make_op("add"), dot_res, z_beta_double);
        m2.add_instruction(migraphx::make_op("identity"), z_add);
    }
    EXPECT(m1 == m2);
}

TEST_CASE(quant_dot_apply_alpha_beta)
{
    migraphx::module m1;
    {
        auto x = m1.add_parameter("x", migraphx::shape{migraphx::shape::int8_type, {2, 2}});
        auto y = m1.add_parameter("y", migraphx::shape{migraphx::shape::int8_type, {2, 2}});
        auto z = m1.add_parameter("z", migraphx::shape{migraphx::shape::int32_type, {2, 2}});
        auto dot_res = migraphx::insert_apply_alpha_beta(m1,
                                                         m1.end(),
                                                         {x, y, z},
                                                         migraphx::make_op("quant_dot"),
                                                         migraphx::literal{int32_t{3}},
                                                         migraphx::literal{int32_t{2}});
        m1.add_instruction(migraphx::make_op("identity"), dot_res);
    }

    migraphx::module m2;
    {
        auto i8  = migraphx::shape::int8_type;
        auto i32 = migraphx::shape::int32_type;
        auto x   = m2.add_parameter("x", migraphx::shape{i8, {2, 2}});
        auto y   = m2.add_parameter("y", migraphx::shape{i8, {2, 2}});
        auto z   = m2.add_parameter("z", migraphx::shape{i32, {2, 2}});
        auto alpha_literal   = m2.add_literal(int32_t(3));
        auto alpha_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", x->get_shape().lens()}}),
            alpha_literal);
        auto x_i32 =
            m2.add_instruction(migraphx::make_op("convert", {{"target_type", i32}}), x);
        auto x_alpha_i32 =
            m2.add_instruction(migraphx::make_op("mul"), alpha_broadcast, x_i32);
        auto x_i8 = m2.add_instruction(
            migraphx::make_op("convert", {{"target_type", i8}}), x_alpha_i32);
        auto dot_res = m2.add_instruction(migraphx::make_op("quant_dot"), x_i8, y);
        auto beta_literal   = m2.add_literal(int32_t(2));
        auto beta_broadcast = m2.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", z->get_shape().lens()}}),
            beta_literal);
        auto z_beta_i32 =
            m2.add_instruction(migraphx::make_op("mul"), z, beta_broadcast);
        auto z_add = m2.add_instruction(migraphx::make_op("add"), dot_res, z_beta_i32);
        m2.add_instruction(migraphx::make_op("identity"), z_add);
    }
    EXPECT(m1 == m2);
}

int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/gpu/pack_int8_args.cpp

@@ -2,6 +2,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/target.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
 #include <migraphx/adjust_allocation.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
 #include <migraphx/gpu/rocblas.hpp>
@@ -48,7 +49,8 @@ TEST_CASE(quant_dot)
     auto l1 = m.add_parameter("a", m1_shape);
     auto l2 = m.add_parameter("b", m2_shape);
     auto l3 = m.add_parameter("c", m3_shape);
-    auto r  = m.add_instruction(migraphx::make_op("quant_dot"), l1, l2, l3);
+    auto r  = migraphx::add_apply_alpha_beta(
+        m, {l1, l2, l3}, migraphx::make_op("quant_dot"), 1, 1);
     m.add_return({r});
     return m;
 };
@@ -62,9 +64,11 @@ TEST_CASE(quant_dot)
     auto l1 = m.add_parameter("a", m1_shape);
     auto l2 = m.add_parameter("b", m2_shape);
     auto l3 = m.add_parameter("c", m3_shape);
+    auto beta   = m.add_literal(1);
     auto output = m.add_parameter("test:#output_0", m3_shape);
+    auto gemm_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
     auto cout  = m.add_instruction(migraphx::make_op("hip::copy"), l3, output);
     auto packa = l2;
     if(int8_x4)
     {
@@ -72,14 +76,24 @@ TEST_CASE(quant_dot)
             migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m2_shape)}}));
         packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), l2, alloc);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm",
-                          {{"alpha", 1}, {"beta", 1}, {"int8_x4_format", int8_x4}}),
-        l1, packa, cout, cout);
-    m.add_return({gemm});
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
+        l1, packa, gemm_alloc);
+    auto beta_broadcast = m.add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", m3_shape.lens()}}), beta);
+    auto beta_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
+    auto beta_contiguous = m.add_instruction(
+        migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
+    auto mul_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
+    auto m3_beta = m.add_instruction(
+        migraphx::make_op("gpu::mul"), l3, beta_contiguous, mul_alloc);
+    auto gemm_add = m.add_instruction(
+        migraphx::make_op("gpu::add"), gemm, m3_beta, output);
+    m.add_return({gemm_add});
     return m;
 };
@@ -89,7 +103,6 @@ TEST_CASE(quant_dot)
     bool flag = get_int8_x4_format();
     auto m2   = create_optimized_int8_x4(flag);
     EXPECT(m1 == m2);
 }
@@ -106,8 +119,7 @@ TEST_CASE(quant_dot_trans)
     auto l2  = m.add_parameter("b", s2);
     auto tl2 = m.add_instruction(
         migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
-    auto r = m.add_instruction(
-        migraphx::make_op("quant_dot", {{"alpha", 3}, {"beta", 2}}), tl1, tl2);
+    auto r = migraphx::add_apply_alpha_beta(
+        m, {tl1, tl2}, migraphx::make_op("quant_dot"), 3);
     m.add_return({r});
     return m;
 };
@@ -120,6 +132,7 @@ TEST_CASE(quant_dot_trans)
     auto l1 = m.add_parameter("a", s1);
     auto l2 = m.add_parameter("b", s2);
+    auto alpha  = m.add_literal(3);
     auto output = m.add_parameter("test:#output_0", s3);
     auto tl1 =
@@ -136,6 +149,34 @@ TEST_CASE(quant_dot_trans)
         migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
     auto contb = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl2, allocb);
+    auto alpha_broadcast = m.add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", conta->get_shape().lens()}}),
+        alpha);
+    auto alpha_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate",
+        {{"shape",
+          migraphx::to_value(migraphx::shape(migraphx::shape::int32_type, {3, 2, 5, 8}))}}));
+    auto alpha_contiguous = m.add_instruction(
+        migraphx::make_op("gpu::contiguous"), alpha_broadcast, alpha_alloc);
+    // alpha = int32 and tl1 = int8, convert tl1 to int32 for multiplication and then convert
+    // back result to int8
+    auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
+    auto tl1_convert = m.add_instruction(
+        migraphx::make_op("gpu::convert", {{"target_type", alpha->get_shape().type()}}),
+        conta, tl1_convert_alloc);
+    auto mul_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
+    auto tl1_alpha_int32 = m.add_instruction(
+        migraphx::make_op("gpu::mul"), alpha_contiguous, tl1_convert, mul_alloc);
+    // convert mul_res to int8
+    auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
+    auto tl1_alpha_int8 = m.add_instruction(
+        migraphx::make_op("gpu::convert", {{"target_type", conta->get_shape().type()}}),
+        tl1_alpha_int32, tl1_alpha_int8_alloc);
     auto packb = contb;
     if(int8_x4)
     {
@@ -143,10 +184,10 @@ TEST_CASE(quant_dot_trans)
             migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
         packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), contb, allocpb);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm",
-                          {{"alpha", 3}, {"beta", 0}, {"int8_x4_format", int8_x4}}),
-        conta, packb, output);
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
+        tl1_alpha_int8, packb, output);
     m.add_return({gemm});
@@ -174,7 +215,8 @@ TEST_CASE(quant_dot_pad)
     auto l1 = m.add_parameter("a", s1);
     auto l2 = m.add_parameter("b", s2);
     auto l3 = m.add_parameter("c", s3);
-    auto r  = m.add_instruction(migraphx::make_op("quant_dot"), l1, l2, l3);
+    auto r  = migraphx::add_apply_alpha_beta(
+        m, {l1, l2, l3}, migraphx::make_op("quant_dot"), 1, 1);
     m.add_return({r});
     return m;
 };
@@ -190,6 +232,7 @@ TEST_CASE(quant_dot_pad)
     auto l1 = m.add_parameter("a", s1);
     auto l2 = m.add_parameter("b", s2);
     auto l3 = m.add_parameter("c", s3);
+    auto beta   = m.add_literal(1);
     auto output = m.add_parameter("test:#output_0", s3);
     auto pl1 = l1;
@@ -213,7 +256,9 @@ TEST_CASE(quant_dot_pad)
         po2);
     }
     auto cout = m.add_instruction(migraphx::make_op("hip::copy"), l3, output);
+    auto gemm_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
     if(int8_x4)
     {
         auto alloc = m.add_instruction(
@@ -221,15 +266,24 @@ TEST_CASE(quant_dot_pad)
         packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pl2, alloc);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm",
-                          {{"alpha", 1}, {"beta", 1}, {"int8_x4_format", int8_x4}}),
-        pl1, packa, cout, cout);
-    m.add_return({gemm});
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
+        pl1, packa, gemm_alloc);
+    auto beta_broadcast = m.add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", s3.lens()}}), beta);
+    auto beta_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
+    auto beta_contiguous = m.add_instruction(
+        migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
+    auto mul_alloc = m.add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
+    auto m3_beta = m.add_instruction(
+        migraphx::make_op("gpu::mul"), l3, beta_contiguous, mul_alloc);
+    auto gemm_add = m.add_instruction(
+        migraphx::make_op("gpu::add"), gemm, m3_beta, output);
+    m.add_return({gemm_add});
     return m;
 };
@@ -255,8 +309,7 @@ TEST_CASE(quant_dot_trans_pad)
     auto l2  = m.add_parameter("b", s2);
     auto tl2 = m.add_instruction(
         migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
-    auto r = m.add_instruction(
-        migraphx::make_op("quant_dot", {{"alpha", 3}, {"beta", 2}}), tl1, tl2);
+    auto r = migraphx::add_apply_alpha_beta(
+        m, {tl1, tl2}, migraphx::make_op("quant_dot"), 3);
     m.add_return({r});
     return m;
 };
@@ -271,6 +324,7 @@ TEST_CASE(quant_dot_trans_pad)
     auto l1 = m.add_parameter("a", s1);
     auto l2 = m.add_parameter("b", s2);
+    auto alpha  = m.add_literal(3);
     auto output = m.add_parameter("test:#output_0", s3);
     auto tl1 =
@@ -278,27 +332,14 @@ TEST_CASE(quant_dot_trans_pad)
     migraphx::shape ts1{migraphx::shape::int8_type, {3, 2, 5, 9}};
     auto ta = m.add_instruction(
         migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts1)}}));
-    migraphx::instruction_ref pta{};
-    if(int8_x4)
-    {
-        pta = m.add_instruction(
-            migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps1)}}));
-    }
    auto conta = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl1, ta);
-    auto pa = conta;
-    if(int8_x4)
-    {
-        pa = m.add_instruction(
-            migraphx::make_op("gpu::pad",
-                              {{"mode", 0}, {"pads", {0, 0, 0, 3, 0, 0, 0, 0}}}),
-            conta, pta);
-    }
     auto tl2 = m.add_instruction(
         migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
     migraphx::shape ts2{migraphx::shape::int8_type, {3, 2, 9, 7}};
     auto tb = m.add_instruction(
         migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
     migraphx::instruction_ref ptb{};
     if(int8_x4)
     {
@@ -306,24 +347,72 @@ TEST_CASE(quant_dot_trans_pad)
             migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
     }
     auto contb = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl2, tb);
-    auto packb = contb;
+    auto pb    = contb;
     if(int8_x4)
     {
-        auto pb = m.add_instruction(
+        pb = m.add_instruction(
             migraphx::make_op("gpu::pad",
                               {{"mode", 0}, {"pads", {0, 0, 3, 0, 0, 0, 0, 0}}}),
             contb, ptb);
     }
+    auto alpha_broadcast = m.add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", conta->get_shape().lens()}}),
+        alpha);
+    auto alpha_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate",
+        {{"shape",
+          migraphx::to_value(
+              migraphx::shape(migraphx::shape::int32_type, conta->get_shape().lens()))}}));
+    auto alpha_contiguous = m.add_instruction(
+        migraphx::make_op("gpu::contiguous"), alpha_broadcast, alpha_alloc);
+    // alpha = int32 and tl1 = int8, convert tl1 to int32 for multiplication and then convert
+    // back result to int8
+    auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
+    auto tl1_convert = m.add_instruction(
+        migraphx::make_op("gpu::convert", {{"target_type", alpha->get_shape().type()}}),
+        conta, tl1_convert_alloc);
+    auto mul_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
+    auto tl1_alpha_int32 = m.add_instruction(
+        migraphx::make_op("gpu::mul"), alpha_contiguous, tl1_convert, mul_alloc);
+    // convert mul_res to int8
+    auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
+        "hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
+    migraphx::instruction_ref pta{};
+    if(int8_x4)
+    {
+        pta = m.add_instruction(
+            migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps1)}}));
+    }
+    auto tl1_alpha_int8 = m.add_instruction(
+        migraphx::make_op("gpu::convert", {{"target_type", conta->get_shape().type()}}),
+        tl1_alpha_int32, tl1_alpha_int8_alloc);
+    auto pa = tl1_alpha_int8;
+    if(int8_x4)
+    {
+        pa = m.add_instruction(
+            migraphx::make_op("gpu::pad",
+                              {{"mode", 0}, {"pads", {0, 0, 0, 3, 0, 0, 0, 0}}}),
+            tl1_alpha_int8, pta);
+    }
+    auto packb = pb;
     if(int8_x4)
     {
         auto allocpb = m.add_instruction(
             migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
         packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pb, allocpb);
     }
     auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm",
-                          {{"alpha", 3}, {"beta", 0}, {"int8_x4_format", int8_x4}}),
-        pa, packb, output);
+        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
+        pa, packb, output);
     m.add_return({gemm});
     return m;
test/onnx/depthtospace_crd_test.onnx (new file, 0 → 100644)
(binary ONNX protobuf, no newline at end of file: a single DepthToSpace node x → y with blocksize attribute and mode "CRD"; generated by depthtospace_crd_test() in gen_onnx.py below)

test/onnx/depthtospace_simple_test.onnx (new file, 0 → 100644)
(binary ONNX protobuf, no newline at end of file: a single DepthToSpace node x → y with blocksize attribute and mode "DCR"; generated by depthtospace_simple_test() below)

test/onnx/depthtospace_test.onnx (new file, 0 → 100644)
(binary ONNX protobuf, no newline at end of file: a single DepthToSpace node x → y with blocksize attribute and mode "DCR"; generated by depthtospace_test() below)
test/onnx/gen_onnx.py

@@ -1016,6 +1016,51 @@ def deconv_stride_test():
     return ([node], [x, w], [y])


+@onnx_test
+def depthtospace_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 8, 5, 5])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 10, 10])
+    node = onnx.helper.make_node('DepthToSpace',
+                                 inputs=['x'],
+                                 outputs=['y'],
+                                 blocksize=2,
+                                 mode='DCR')
+    return ([node], [x], [y])
+
+
+@onnx_test
+def depthtospace_simple_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 8, 2, 3])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 2, 4, 6])
+    node = onnx.helper.make_node('DepthToSpace',
+                                 inputs=['x'],
+                                 outputs=['y'],
+                                 blocksize=2,
+                                 mode='DCR')
+    return ([node], [x], [y])
+
+
+@onnx_test
+def depthtospace_crd_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 8, 5, 5])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 10, 10])
+    node = onnx.helper.make_node('DepthToSpace',
+                                 inputs=['x'],
+                                 outputs=['y'],
+                                 blocksize=2,
+                                 mode='CRD')
+    return ([node], [x], [y])
+
+
 @onnx_test
 def dequantizelinear_test():
     arg0 = helper.make_tensor_value_info('0', TensorProto.INT8, [5])
@@ -2607,6 +2652,59 @@ def min_test():
     return ([node], [a, b, c], [y])


+@onnx_test
+def multinomial_test():
+    sample_size = 10
+    seed = 0.0
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
+    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])
+    node = onnx.helper.make_node('Multinomial',
+                                 inputs=['input'],
+                                 sample_size=sample_size,
+                                 seed=seed,
+                                 outputs=['output'])
+    return ([node], [input], [output])
+
+
+@onnx_test
+def multinomial_dtype_error_test():
+    sample_size = 10
+    dtype = 0
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
+    output = helper.make_tensor_value_info("output", TensorProto.INT64, [1, 10])
+    node = onnx.helper.make_node('Multinomial',
+                                 inputs=['input'],
+                                 sample_size=sample_size,
+                                 dtype=dtype,
+                                 outputs=['output'])
+    return ([node], [input], [output])
+
+
+@onnx_test
+def multinomial_int64_test():
+    sample_size = 10
+    dtype = 7
+    seed = 1.0
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
+    output = helper.make_tensor_value_info("output", TensorProto.INT64, [1, 10])
+    node = onnx.helper.make_node('Multinomial',
+                                 inputs=['input'],
+                                 sample_size=sample_size,
+                                 dtype=dtype,
+                                 seed=seed,
+                                 outputs=['output'])
+    return ([node], [input], [output])
+
+
 @onnx_test
 def neg_test():
     x = helper.make_tensor_value_info('0', TensorProto.INT64, [2, 3])
@@ -2650,6 +2748,18 @@ def no_pad_test():
     return ([node], [x], [y])


+@onnx_test
+def nonzero_dynamic_test():
+    x = helper.make_tensor_value_info('data', TensorProto.BOOL, [2, 2])
+    y = helper.make_tensor_value_info('indices', TensorProto.INT64, [2, 3])
+    node = onnx.helper.make_node('NonZero', inputs=['data'], outputs=['indices'])
+    return ([node], [x], [y])
+
+
 @onnx_test
 def nonzero_test():
     data1 = np.array([[1., 0.], [1., 1.]])
@@ -2947,6 +3057,186 @@ def quantizelinear_neg_axis_test():
     return make_quantizelinear_axis_graph(-2)


+@onnx_test
+def randomnormal_test():
+    dtype = 11
+    mean = 10.0
+    scale = 1.5
+    seed = 0.0
+    shape = [2, 3, 4]
+    output = helper.make_tensor_value_info('output', TensorProto.DOUBLE, [2, 3, 4])
+    node = onnx.helper.make_node('RandomNormal',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 mean=mean,
+                                 scale=scale,
+                                 seed=seed,
+                                 shape=shape)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomnormal_dtype_error_test():
+    dtype = 6
+    shape = [2, 3, 4]
+    output = helper.make_tensor_value_info('output', TensorProto.INT32, [2, 3, 4])
+    node = onnx.helper.make_node('RandomNormal',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 shape=shape)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomnormal_shape_error_test():
+    dtype = 1
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [2, 3, 4])
+    node = onnx.helper.make_node('RandomNormal',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomnormallike_test():
+    dtype = 10
+    mean = 10.0
+    scale = 1.5
+    seed = 0.0
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT16, [2, 3, 4])
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT16, [2, 3, 4])
+    node = onnx.helper.make_node('RandomNormalLike',
+                                 inputs=['input'],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 mean=mean,
+                                 scale=scale,
+                                 seed=seed)
+    return ([node], [input], [output])
+
+
+@onnx_test
+def randomnormallike_type_error_test():
+    seed = 0
+    input = helper.make_tensor_value_info('input', TensorProto.INT32, [2, 3, 4])
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [2, 3, 4])
+    node = onnx.helper.make_node('RandomNormalLike',
+                                 inputs=['input'],
+                                 outputs=['output'],
+                                 seed=seed)
+    return ([node], [input], [output])
+
+
+@onnx_test
+def randomuniform_test():
+    dtype = 11
+    high = 1.0
+    low = 0.0
+    seed = 0.0
+    shape = [2, 3, 4]
+    output = helper.make_tensor_value_info('output', TensorProto.DOUBLE, [2, 3, 4])
+    node = onnx.helper.make_node('RandomUniform',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 high=high,
+                                 low=low,
+                                 seed=seed,
+                                 shape=shape)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomuniform_dtype_error_test():
+    dtype = 6
+    shape = [2, 3, 4]
+    output = helper.make_tensor_value_info('output', TensorProto.INT32, [2, 3, 4])
+    node = onnx.helper.make_node('RandomUniform',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 shape=shape)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomuniform_shape_error_test():
+    dtype = 1
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [2, 3, 4])
+    node = onnx.helper.make_node('RandomUniform',
+                                 inputs=[],
+                                 outputs=['output'],
+                                 dtype=dtype)
+    return ([node], [], [output])
+
+
+@onnx_test
+def randomuniformlike_test():
+    dtype = 10
+    high = 10.0
+    low = 1.0
+    seed = 0.0
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT16, [2, 3, 4])
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT16, [2, 3, 4])
+    node = onnx.helper.make_node('RandomUniformLike',
+                                 inputs=['input'],
+                                 outputs=['output'],
+                                 dtype=dtype,
+                                 high=high,
+                                 low=low,
+                                 seed=seed)
+    return ([node], [input], [output])
+
+
+@onnx_test
+def randomuniformlike_type_error_test():
+    seed = 0
+    input = helper.make_tensor_value_info('input', TensorProto.INT32, [2, 3, 4])
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [2, 3, 4])
+    node = onnx.helper.make_node('RandomUniformLike',
+                                 inputs=['input'],
+                                 outputs=['output'],
+                                 seed=seed)
+    return ([node], [input], [output])
+
+
 @onnx_test
 def range_test():
     ...
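Note: the new generator functions above emit small .onnx files that the parser tests consume. A minimal sketch of loading one of them through MIGraphX's C++ API; parse_onnx and its default options are assumed from the library's onnx.hpp header, and the file path is simply whatever gen_onnx.py wrote out:

#include <iostream>
#include <migraphx/onnx.hpp>

int main()
{
    // parse the generated model into a migraphx::program and print its IR
    auto prog = migraphx::parse_onnx("multinomial_test.onnx");
    std::cout << prog << std::endl;
}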