Merge branch 'develop' into uif2-initial

8d7a8a6c · Artur Wojcik · 25b33431 · a09dc502 · 8d7a8a6c · 8d7a8a6c
Commit 8d7a8a6c authored Dec 06, 2023 by Artur Wojcik
20 changed files
--- a/examples/diffusion/python_stable_diffusion_21/txt2img.py
+++ b/examples/diffusion/python_stable_diffusion_21/txt2img.py
+#  The MIT License (MIT)
+#
+#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the 'Software'), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+from argparse import ArgumentParser
+from diffusers import EulerDiscreteScheduler
+from transformers import CLIPTokenizer
+from PIL import Image
+
+import migraphx as mgx
+import numpy as np
+import os
+import torch
+import time
+from functools import wraps
+
+
+# measurement helper
+def measure(fn):
+    """Decorator that reports how long each call to ``fn`` takes.
+
+    The duration is measured with ``time.perf_counter_ns`` and printed in
+    milliseconds after ``fn`` returns; the return value of ``fn`` is handed
+    back unchanged. Nothing is printed when ``fn`` raises.
+    """
+    def _timed(*args, **kwargs):
+        began = time.perf_counter_ns()
+        outcome = fn(*args, **kwargs)
+        elapsed_ns = time.perf_counter_ns() - began
+        print(f"Elapsed time: {elapsed_ns * 1e-6:.4f} ms\n")
+        return outcome
+
+    # Copy name/docstring metadata of ``fn`` onto the timing wrapper.
+    return wraps(fn)(_timed)
+
+
+def get_args():
+    """Parse the command-line options of this example.
+
+    Options: ``-s/--seed`` (int, default 42), ``-t/--steps`` (int, default 20),
+    ``-p/--prompt`` (str, required), ``-n/--negative-prompt`` (str, default
+    empty), ``--scale`` (float, default 7.0) and ``-o/--output`` (str, default
+    None).
+
+    Returns:
+        The ``argparse`` namespace produced by ``parse_args()``.
+    """
+    # (flags, keyword settings) in the order the options are registered.
+    option_specs = (
+        (("-s", "--seed"), dict(type=int, default=42, help="Random seed")),
+        (("-t", "--steps"), dict(type=int, default=20,
+                                 help="Number of steps")),
+        (("-p", "--prompt"), dict(type=str, required=True, help="Prompt")),
+        (("-n", "--negative-prompt"), dict(type=str, default="",
+                                           help="Negative prompt")),
+        (("--scale", ), dict(type=float, default=7.0, help="Guidance scale")),
+        (("-o", "--output"), dict(type=str, default=None,
+                                  help="Output name")),
+    )
+
+    cli = ArgumentParser()
+    for flags, settings in option_specs:
+        cli.add_argument(*flags, **settings)
+    return cli.parse_args()
+
+
+class StableDiffusionMGX():
+    """Text-to-image pipeline that runs the Stable Diffusion 2.1 models
+    (text encoder, UNet, VAE decoder) through MIGraphX.
+
+    The scheduler and tokenizer are created with ``from_pretrained`` for
+    ``stabilityai/stable-diffusion-2-1``; the three models are loaded from
+    ``models/sd21-onnx/<name>/model.{mxr,onnx}`` relative to the working
+    directory.
+    """
+    def __init__(self):
+        """Create the scheduler and tokenizer and load the three models."""
+        model_id = "stabilityai/stable-diffusion-2-1"
+        print(f"Using {model_id}")
+
+        print("Creating EulerDiscreteScheduler scheduler")
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            model_id, subfolder="scheduler")
+
+        print("Creating CLIPTokenizer tokenizer...")
+        self.tokenizer = CLIPTokenizer.from_pretrained(model_id,
+                                                       subfolder="tokenizer")
+
+        print("Load models...")
+        # Input shapes are fixed when the ONNX files are parsed.
+        self.vae = StableDiffusionMGX.load_mgx_model(
+            "vae_decoder", {"latent_sample": [1, 4, 64, 64]})
+        self.text_encoder = StableDiffusionMGX.load_mgx_model(
+            "text_encoder", {"input_ids": [1, 77]})
+        self.unet = StableDiffusionMGX.load_mgx_model(
+            "unet", {
+                "sample": [1, 4, 64, 64],
+                "encoder_hidden_states": [1, 77, 1024],
+                "timestep": [1],
+            })
+
+    def run(self, prompt, negative_prompt, steps, seed, scale):
+        """Generate one image for ``prompt``.
+
+        Args:
+            prompt: Text prompt; embedded and used as the conditioned input.
+            negative_prompt: Text embedded and used as the unconditioned input.
+            steps: Number of timesteps passed to the scheduler.
+            seed: Seed for the initial random latents.
+            scale: Guidance scale applied in each denoising step.
+
+        Returns:
+            The numpy array produced by ``decode`` (not yet converted to an
+            RGB image; see ``convert_to_rgb_image``).
+        """
+        # need to set this for each run
+        self.scheduler.set_timesteps(steps)
+
+        print("Tokenizing prompt...")
+        text_input = self.tokenize(prompt)
+
+        print("Creating text embeddings for prompt...")
+        text_embeddings = self.get_embeddings(text_input)
+
+        print("Tokenizing negative prompt...")
+        uncond_input = self.tokenize(negative_prompt)
+
+        print("Creating text embeddings for negative prompt...")
+        uncond_embeddings = self.get_embeddings(uncond_input)
+
+        print(
+            f"Creating random input data ({1}x{4}x{64}x{64}) (latents) with seed={seed}..."
+        )
+        latents = torch.randn((1, 4, 64, 64),
+                              generator=torch.manual_seed(seed))
+
+        print("Apply initial noise sigma\n")
+        latents = latents * self.scheduler.init_noise_sigma
+
+        print("Running denoising loop...")
+        for step, t in enumerate(self.scheduler.timesteps):
+            print(f"#{step}/{len(self.scheduler.timesteps)} step")
+            latents = self.denoise_step(text_embeddings, uncond_embeddings,
+                                        latents, t, scale)
+
+        print("Scale denoised result...")
+        # NOTE(review): 0.18215 is presumably the VAE latent scaling factor
+        # of the model - confirm against the model's VAE config.
+        latents = 1 / 0.18215 * latents
+
+        print("Decode denoised result...")
+        image = self.decode(latents)
+
+        return image
+
+    @staticmethod
+    @measure
+    def load_mgx_model(name, shapes):
+        """Load the MIGraphX program for the model called ``name``.
+
+        A previously saved ``model.mxr`` is loaded when present. Otherwise
+        ``model.onnx`` is parsed with ``shapes`` as input dimensions,
+        compiled for the "gpu" target and saved as ``model.mxr``.
+
+        Args:
+            name: Sub-directory of ``models/sd21-onnx`` holding the model.
+            shapes: Mapping of input name to dimensions, passed to
+                ``mgx.parse_onnx`` as ``map_input_dims``.
+
+        Returns:
+            The loaded (or freshly compiled) MIGraphX program.
+
+        Raises:
+            SystemExit: With status 1 when neither file exists.
+        """
+        file = f"models/sd21-onnx/{name}/model"
+        print(f"Loading {name} model from {file}")
+        if os.path.isfile(f"{file}.mxr"):
+            print("Found mxr, loading it...")
+            model = mgx.load(f"{file}.mxr", format="msgpack")
+        elif os.path.isfile(f"{file}.onnx"):
+            print("Parsing from onnx file...")
+            model = mgx.parse_onnx(f"{file}.onnx", map_input_dims=shapes)
+            model.compile(mgx.get_target("gpu"))
+            print(f"Saving {name} model to mxr file...")
+            mgx.save(model, f"{file}.mxr", format="msgpack")
+        else:
+            print(f"No {name} model found. Please download it and re-try.")
+            # The os module has no exit(); raise SystemExit so the process
+            # really terminates with status 1 instead of an AttributeError.
+            raise SystemExit(1)
+        return model
+
+    @measure
+    def tokenize(self, input):
+        """Tokenize the single string ``input``, padded/truncated to the
+        tokenizer's ``model_max_length``, returning numpy tensors."""
+        return self.tokenizer([input],
+                              padding="max_length",
+                              max_length=self.tokenizer.model_max_length,
+                              truncation=True,
+                              return_tensors="np")
+
+    @measure
+    def get_embeddings(self, input):
+        """Run the text encoder on ``input.input_ids`` (cast to int32) and
+        return its first output as a float32 numpy array."""
+        return np.array(
+            self.text_encoder.run(
+                {"input_ids":
+                 input.input_ids.astype(np.int32)})[0]).astype(np.float32)
+
+    @staticmethod
+    def convert_to_rgb_image(image):
+        """Convert decoder output to a PIL image.
+
+        Values are mapped with ``image / 2 + 0.5`` and clipped to [0, 1],
+        axes are reordered with ``(0, 2, 3, 1)``, and the result is scaled to
+        uint8. Only the first element along axis 0 is turned into an image.
+        """
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = np.transpose(image, (0, 2, 3, 1))
+        images = (image * 255).round().astype("uint8")
+        return Image.fromarray(images[0])
+
+    @staticmethod
+    def save_image(pil_image, filename="output.png"):
+        """Save ``pil_image`` to ``filename``."""
+        pil_image.save(filename)
+
+    @measure
+    def denoise_step(self, text_embeddings, uncond_embeddings, latents, t,
+                     scale):
+        """Run one guided denoising step and return the scheduler's
+        ``prev_sample`` for timestep ``t``.
+
+        The UNet is executed twice on the same scaled sample: once with the
+        unconditioned embeddings and once with the text embeddings.
+        """
+        sample = self.scheduler.scale_model_input(latents,
+                                                  t).numpy().astype(np.float32)
+        timestep = np.atleast_1d(t.numpy().astype(
+            np.int64))  # convert 0D -> 1D
+
+        noise_pred_uncond = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": uncond_embeddings,
+                "timestep": timestep
+            })[0])
+
+        noise_pred_text = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": text_embeddings,
+                "timestep": timestep
+            })[0])
+
+        # perform guidance
+        noise_pred = noise_pred_uncond + scale * (noise_pred_text -
+                                                  noise_pred_uncond)
+
+        # compute the previous noisy sample x_t -> x_t-1
+        return self.scheduler.step(torch.from_numpy(noise_pred), t,
+                                   latents).prev_sample
+
+    @measure
+    def decode(self, latents):
+        """Run the VAE decoder on ``latents`` (a torch tensor, converted to a
+        float32 numpy array) and return its first output as a numpy array."""
+        return np.array(
+            self.vae.run({"latent_sample":
+                          latents.numpy().astype(np.float32)})[0])
+
+
+if __name__ == "__main__":
+    # Script entry point: parse options, generate one image, write it to disk.
+    args = get_args()
+
+    sd = StableDiffusionMGX()
+    result = sd.run(args.prompt, args.negative_prompt, args.steps, args.seed,
+                    args.scale)
+
+    print("Convert result to rgb image...")
+    image = StableDiffusionMGX.convert_to_rgb_image(result)
+    # Without -o/--output, fall back to a name built from seed and step count.
+    filename = args.output if args.output else f"output_s{args.seed}_t{args.steps}.png"
+    # Save under the resolved name; args.output is None when -o is omitted,
+    # so passing it here would ignore the fallback computed above.
+    StableDiffusionMGX.save_image(image, filename)
+    print(f"Image saved to {filename}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,4 @@ pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@13f6c2a69cfe80a575c6b241ec7353d1e953cb12 -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/rocMLIR@a6880f1e6daec99876cd6a4820fbc69c57216401 -DBUILD_FAT_LIBROCKCOMPILER=On
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -221,6 +221,8 @@ register_migraphx_ops(
    scatternd_add
    scatternd_mul
    scatternd_none
+    scatternd_max
+    scatternd_min
    select_module
    sigmoid
    sign
@@ -239,6 +241,7 @@ register_migraphx_ops(
    transpose
    unary_not
    undefined
+    unique
    unknown
    unsqueeze
    where
@@ -288,6 +291,7 @@ find_package(TBB QUIET)
 if(TBB_FOUND)
    check_execution_par(TBB_HAS_EXECUTION_PAR TBB::tbb)
    if(TBB_HAS_EXECUTION_PAR)
+        list(APPEND PACKAGE_DEPENDS PACKAGE TBB)
        target_link_libraries(migraphx PUBLIC TBB::tbb)
        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
        message(STATUS "Using TBB for parallel execution")

--- a/src/driver/CMakeLists.txt
+++ b/src/driver/CMakeLists.txt
@@ -25,6 +25,7 @@
 add_executable(driver 
    main.cpp
    verify.cpp
+    passes.cpp
    perf.cpp
    resnet50.cpp
    inceptionv3.cpp

--- a/src/driver/main.cpp
+++ b/src/driver/main.cpp
@@ -26,6 +26,7 @@
 #include "argument_parser.hpp"
 #include "command.hpp"
 #include "precision.hpp"
+#include "passes.hpp"
 #include "perf.hpp"
 #include "models.hpp"
 #include "marker_roctx.hpp"
@@ -83,6 +84,7 @@ struct loader
    std::vector<std::string> param_dims;
    std::vector<std::string> dyn_param_dims;
    std::vector<std::string> output_names;
+    std::vector<std::string> passes;

    void parse(argument_parser& ap)
    {
@@ -130,6 +132,7 @@ struct loader
           ap.append(),
           ap.nargs(2));
        ap(optimize, {"--optimize", "-O"}, ap.help("Optimize when reading"), ap.set_value(true));
+        ap(passes, {"--apply-pass", "-p"}, ap.help("Passes to apply to model"), ap.append());
        ap(output_type,
           {"--graphviz", "-g"},
           ap.help("Print out a graphviz representation."),
@@ -337,6 +340,8 @@ struct loader
                                     migraphx::dead_code_elimination{},
                                 });
        }
+        if(not passes.empty())
+            migraphx::run_passes(*p.get_main_module(), get_passes(passes));
        return p;
    }


--- a/src/targets/gpu/device/gather.cpp
+++ b/src/targets/gpu/device/gather.cpp
@@ -21,47 +21,89 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/shape.hpp>
-#include <migraphx/argument.hpp>
-#include <migraphx/gpu/device/gather.hpp>
-#include <migraphx/gpu/device/tensor.hpp>
-#include <migraphx/gpu/device/launch.hpp>
-#include <migraphx/gpu/device/types.hpp>
+
+#include "passes.hpp"
+
+#include <migraphx/auto_contiguous.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/eliminate_allocation.hpp>
+#include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/eliminate_concat.hpp>
+#include <migraphx/eliminate_contiguous.hpp>
+#include <migraphx/eliminate_data_type.hpp>
+#include <migraphx/eliminate_identity.hpp>
+#include <migraphx/eliminate_pad.hpp>
+#include <migraphx/inline_module.hpp>
+#include <migraphx/insert_pad.hpp>
+#include <migraphx/normalize_ops.hpp>
+#include <migraphx/optimize_module.hpp>
+#include <migraphx/promote_literals.hpp>
+#include <migraphx/propagate_constant.hpp>
+#include <migraphx/rewrite_gelu.hpp>
+#include <migraphx/rewrite_pooling.hpp>
+#include <migraphx/rewrite_quantization.hpp>
+#include <migraphx/rewrite_rnn.hpp>
+#include <migraphx/simplify_algebra.hpp>
+#include <migraphx/simplify_dyn_ops.hpp>
+#include <migraphx/simplify_qdq.hpp>
+#include <migraphx/simplify_reshapes.hpp>
+
+#include <migraphx/ranges.hpp>
+#include <unordered_map>

 namespace migraphx {
+namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {

-argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis)
+std::unordered_map<std::string, pass> create_passes_lookup()
 {
-    const auto& input_shape = arg1.get_shape();
-    auto lens               = input_shape.lens();
-    auto axis_dim_size      = lens[axis];
-    lens[axis]              = arg2.get_shape().elements();
-    shape out_comp_shape{result.get_shape().type(), lens};
-    std::size_t nelements = result.get_shape().elements();
+    std::unordered_map<std::string, pass> result;
+    // clang-format off
+    std::initializer_list<pass> passes = {
+        auto_contiguous{},
+        dead_code_elimination{},
+        eliminate_allocation{},
+        eliminate_common_subexpression{},
+        eliminate_concat{},
+        eliminate_contiguous{},
+        eliminate_data_type{},
+        eliminate_identity{},
+        eliminate_pad{},
+        inline_module{},
+        insert_pad{},
+        normalize_ops{},
+        optimize_module{},
+        promote_literals{},
+        propagate_constant{},
+        rewrite_gelu{},
+        rewrite_pooling{},
+        rewrite_quantization{},
+        rewrite_rnn{},
+        simplify_algebra{},
+        simplify_dyn_ops{},
+        simplify_qdq{},
+        simplify_reshapes{},
+    };
+    // clang-format on
+    for(const auto& pass : passes)
+        result[pass.name()] = pass;
+    result["eliminate_dead_code"] = dead_code_elimination{};
+    return result;
+}

-    visit_all(result, arg1)([&](auto output, auto input_v) {
-        hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
-            arg2.visit([&](auto indices) {
-                const auto* indices_ptr = device_cast(indices.data());
-                auto* output_ptr        = device_cast(output.data());
-                gs_launch(stream, nelements, 256)([=](auto i) __device__ {
-                    auto idx      = out_comp.multi(i);
-                    auto in_index = indices_ptr[idx[axis]];
-                    in_index      = (in_index < 0) ? in_index + axis_dim_size : in_index;
-                    idx[axis]     = in_index;
-                    output_ptr[i] = input[idx];
-                });
-            });
+std::vector<pass> get_passes(const std::vector<std::string>& names)
+{
+    std::vector<pass> result;
+    static const std::unordered_map<std::string, pass> lookup = create_passes_lookup();
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(result), [](const std::string& name) {
+            if(not contains(lookup, name))
+                MIGRAPHX_THROW("Unknown pass: " + name);
+            return lookup.at(name);
        });
-    });
-
    return result;
 }

-} // namespace device
-} // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
+} // namespace driver
 } // namespace migraphx
--- a/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
@@ -21,24 +21,20 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
+#ifndef MIGRAPHX_GUARD_DRIVER_PASSES_HPP
+#define MIGRAPHX_GUARD_DRIVER_PASSES_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/gpu/device/config.hpp>
-#include <hip/hip_runtime_api.h>
+#include <migraphx/pass.hpp>
+#include <vector>

 namespace migraphx {
+namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {

-argument MIGRAPHX_DEVICE_EXPORT
-gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis);
+std::vector<pass> get_passes(const std::vector<std::string>& names);

-} // namespace device
-} // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
+} // namespace driver
 } // namespace migraphx

 #endif
--- a/src/eliminate_data_type.cpp
+++ b/src/eliminate_data_type.cpp
@@ -31,6 +31,72 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+// Re-create `ins` so that none of its inputs has a type listed in
+// `unsupported_types`.
+//
+// Each input whose shape type is in `unsupported_types` is routed through a
+// "convert" to `target_type`. If at least one input was converted, a copy of
+// the instruction is inserted with the new inputs and the original instruction
+// (or, for tuple outputs, each of its get_tuple_elem users) is replaced. When
+// no input needs converting the module is left untouched.
+//
+// Throws (MIGRAPHX_THROW) when `ins` has a tuple output and one of its users
+// is not a get_tuple_elem instruction.
+void insert_convert_to_supported_type(module& m,
+                                      instruction_ref ins,
+                                      migraphx::shape::type_t target_type,
+                                      std::set<migraphx::shape::type_t> unsupported_types)
+{
+    migraphx::shape::type_t orig_type   = ins->get_shape().type();
+    std::vector<instruction_ref> inputs = ins->inputs();
+    // Replace every input of an unsupported type by a convert to target_type;
+    // inputs of any other type are kept as they are.
+    std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](const auto& i) {
+        if(contains(unsupported_types, i->get_shape().type()))
+        {
+            return m.insert_instruction(
+                ins,
+                migraphx::make_op("convert", {{"target_type", migraphx::to_value(target_type)}}),
+                i);
+        }
+        else
+        {
+            return i;
+        }
+    });
+    // Nothing was converted, so there is nothing to rewrite.
+    if(inputs == ins->inputs())
+        return;
+    auto op         = ins->get_operator();
+    auto attributes = op.attributes();
+    // If the operator names a "general_data_type" operator in its attributes,
+    // build that operator from the original operator's value and use it.
+    if(attributes.contains("general_data_type"))
+    {
+        op = make_op(attributes["general_data_type"].to<std::string>(), op.to_value());
+    }
+    auto new_ins = m.insert_instruction(ins, op, inputs);
+    if(orig_type == shape::tuple_type)
+    {
+        // Tuple result: the users of `ins` are rewritten one by one, so every
+        // user has to be a get_tuple_elem.
+        auto orig_outs = ins->outputs();
+        if(not std::all_of(orig_outs.begin(), orig_outs.end(), [&](const auto out_ins) {
+               return out_ins->name() == "get_tuple_elem";
+           }))
+            MIGRAPHX_THROW(
+                "eliminate_data_type: Instruction with tuple output doesn't have all its "
+                "usages as get_tuple_elem instruction");
+
+        // For each user: re-issue its operator on the new instruction and, if
+        // the user's original type is an unsupported one, convert the element
+        // back to that type before replacing the user.
+        std::transform(
+            orig_outs.begin(), orig_outs.end(), orig_outs.begin(), [&](const auto out_ins) {
+                auto gte_ins       = m.insert_instruction(ins, out_ins->get_operator(), new_ins);
+                auto orig_out_type = out_ins->get_shape().type();
+                if(contains(unsupported_types, orig_out_type))
+                {
+                    auto gte_convert = m.insert_instruction(
+                        ins, make_op("convert", {{"target_type", orig_out_type}}), gte_ins);
+                    return m.replace_instruction(out_ins, gte_convert);
+                }
+                else
+                {
+                    return m.replace_instruction(out_ins, gte_ins);
+                }
+            });
+    }
+    else
+    {
+        // Non-tuple result: convert the new result back to the original output
+        // type and substitute it for `ins`.
+        auto convert_back_ins = m.insert_instruction(
+            ins,
+            migraphx::make_op("convert", {{"target_type", migraphx::to_value(orig_type)}}),
+            new_ins);
+        m.replace_instruction(ins, convert_back_ins);
+    }
+}
+
 void eliminate_data_type::apply(module& m) const
 {
    static const std::vector<std::string> skip_op_names = {"convert",
@@ -42,31 +108,17 @@ void eliminate_data_type::apply(module& m) const
                                                           "scatternd_add",
                                                           "scatternd_mul",
                                                           "scatternd_none"};
+    if(unsupported_types.empty())
+        return;
+
    for(auto ins : iterator_for(m))
    {
        if(ins->name()[0] == '@')
            continue;
-        if(contains(skip_op_names, ins->name()))
-            continue;
-        auto inputs = ins->inputs();
-        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto i) {
-            if(types.count(i->get_shape().type()) == 0)
-                return i;
-            return m.insert_instruction(ins, make_op("convert", {{"target_type", target_type}}), i);
-        });
-        if(inputs == ins->inputs())
+        if(contains(skip_op_names, ins->name()) and not contains(unsupported_ops, ins->name()))
            continue;
-        auto op         = ins->get_operator();
-        auto attributes = op.attributes();
-        if(attributes.contains("general_data_type"))
-        {
-            op = make_op(attributes["general_data_type"].to<std::string>(), op.to_value());
-        }
-        auto old_type = ins->get_shape().type();
-        auto out      = m.insert_instruction(ins, op, inputs);
-        auto convert =
-            m.insert_instruction(ins, make_op("convert", {{"target_type", old_type}}), out);
-        m.replace_instruction(ins, convert);
+        if(contains(unsupported_ops, "all") or contains(unsupported_ops, ins->name()))
+            insert_convert_to_supported_type(m, ins, target_type, unsupported_types);
    }
 }


--- a/src/include/migraphx/bit_cast.hpp
+++ b/src/include/migraphx/bit_cast.hpp
@@ -21,10 +21,13 @@
 * ************************************************************************ */
 #ifndef MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
 #define MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
+#include <type_traits>
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif
+
+#include <migraphx/requires.hpp>
 #include <migraphx/config.hpp>

 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
@@ -32,7 +35,10 @@

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-template <typename To, typename From>
+template <typename To,
+          typename From,
+          MIGRAPHX_REQUIRES(std::is_trivially_copyable<To>{} and
+                            std::is_trivially_copyable<From>{})>
 inline constexpr To bit_cast(From fr) noexcept
 {
    static_assert(sizeof(To) == sizeof(From));

--- a/src/include/migraphx/eliminate_data_type.hpp
+++ b/src/include/migraphx/eliminate_data_type.hpp
@@ -40,8 +40,9 @@ struct module;
 */
 struct MIGRAPHX_EXPORT eliminate_data_type
 {
-    std::set<shape::type_t> types;
+    std::set<shape::type_t> unsupported_types;
    shape::type_t target_type;
+    std::set<std::string> unsupported_ops = {"all"};
    std::string name() const { return "eliminate_data_type"; }
    void apply(module& m) const;
 };

--- a/src/targets/gpu/gather.cpp
+++ b/src/targets/gpu/gather.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,25 +21,27 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/gather.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/gather.hpp>
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
+
+#include <migraphx/op/scatternd_op.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace op {

-shape hip_gather::compute_shape(std::vector<shape> inputs) const
+struct scatternd_max : scatternd_op<scatternd_max>
 {
-    inputs.pop_back();
-    return op.normalize_compute_shape(inputs);
-}
+    scatternd_max() {}

-argument hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
-{
-    return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis);
-}
+    auto reduction() const
+    {
+        return [](auto& x, const auto& y) { x = std::max(x, y); };
+    }
+};

-} // namespace gpu
+} // namespace op
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
+
+#endif
--- a/src/targets/gpu/pad.cpp
+++ b/src/targets/gpu/pad.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,26 +21,27 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/pad.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/pad.hpp>
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
+
+#include <migraphx/op/scatternd_op.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace op {

-shape hip_pad::compute_shape(std::vector<shape> inputs) const
+struct scatternd_min : scatternd_op<scatternd_min>
 {
-    inputs.pop_back();
-    check_shapes{inputs, *this}.has(1).standard();
-    return op.compute_shape(inputs);
-}
+    scatternd_min() {}

-argument hip_pad::compute(context& ctx, const shape&, const std::vector<argument>& args) const
-{
-    return device::pad(ctx.get_stream().get(), args.back(), args.front(), op.value, op.pads);
-}
+    auto reduction() const
+    {
+        return [](auto& x, const auto& y) { x = std::min(x, y); };
+    }
+};

-} // namespace gpu
+} // namespace op
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
+
+#endif
--- a/src/include/migraphx/op/scatternd_op.hpp
+++ b/src/include/migraphx/op/scatternd_op.hpp
@@ -121,7 +121,8 @@ struct scatternd_op : op_name<Derived>
                auto k             = indices_shape.lens().back();
                auto q             = indices_shape.ndim();
                auto r             = dyn_out.computed_shape.ndim();
-                par_for(updates_shape.elements(), [&](const auto i) {
+                for(auto i = 0u; i < updates_shape.elements(); ++i)
+                {
                    auto updates_idx = updates_std.multi(i);
                    std::vector<std::size_t> indices_idx(q, 0);
                    std::copy(
@@ -135,7 +136,7 @@ struct scatternd_op : op_name<Derived>
                    std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);

                    self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]);
-                });
+                }
            });
        });


--- a/src/include/migraphx/op/unique.hpp
+++ b/src/include/migraphx/op/unique.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+#define MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/tune_axis.hpp>
+#include <utility>
+#include <map>
+#include <limits>
+#include <optional>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+// https://onnx.ai/onnx/operators/onnx__Unique.html
+// The Onnx spec refers to numpy specification, used as a reference:
+// https://numpy.org/doc/stable/reference/generated/numpy.unique.html
+
+// Input : Given an array of elements : X.
+
+// Output(s) :
+// 1. Find the unique elements (Y) of input (X).
+//
+// There are three outputs in addition to the unique elements in Y:
+// 2. the indices of the input array that give the unique values
+// 3. the indices of the unique array that reconstruct the input array
+// 4. the number of times each unique value comes up in the input array
+
+// Optional Attribute: 'Sorted' = 1 for sorted; = 0 for unsorted.
+// Onnx specification makes 'sorted' a default, while Numpy always sorts.
+//
+// Optional Attribute: 'Axis' is 'None' (default) or a valid int < rank(X).
+// Negative values are allowed.
+//
+// Numpy has the following important note on Axis:
+// ------------------------------------------------------------------
+// When an axis is specified the subarrays indexed by the axis are
+// sorted. This is done by making the specified axis the first
+// dimension of the array (move the axis to the first dimension to
+// keep the order of the other axes) and then flattening the subarrays
+// in C order. The flattened subarrays are then viewed as a structured
+// type with each element given a label, with the effect that we end
+// up with a 1-D array of structured types that can be treated in the
+// same way as any other 1-D array. The result is that the flattened
+// subarrays are sorted in lexicographic order starting with the first
+// element.
+// ------------------------------------------------------------------
+
+// Unique operator.
+//
+// Attributes:
+//   axis   - optional. When set it must resolve to 0 (compute_shape throws otherwise) and
+//            uniqueness is decided per sub-tensor ("chunk") along axis 0. When unset, every
+//            element of the input is compared individually.
+//   sorted - true: unique values come out in ascending order (lexicographic for chunks);
+//            false: unique values come out in order of first occurrence.
+//
+// Outputs, in this order: y, y_indices, x_rev_indices, y_count.
+struct unique
+{
+
+    // Builds a "less than" comparator over flat start offsets into `data`.
+    // Offsets idx1/idx2 each denote the chunk [idx, idx + chunk_sz); chunks are compared
+    // lexicographically. `data` is captured by reference, so it must outlive the comparator.
+    template <class T>
+    auto make_idx_less_fn(const T& data, size_t chunk_sz) const
+    {
+        return [&data, chunk_sz](auto idx1, auto idx2) {
+            return std::lexicographical_compare(data.begin() + idx1,
+                                                data.begin() + idx1 + chunk_sz,
+                                                data.begin() + idx2,
+                                                data.begin() + idx2 + chunk_sz);
+        };
+    }
+
+    // CASE SORTED:
+    //
+    // To process into a sorted unique series of elements/chunks:
+    // Chunk size == 1 means a simple element; >1 means a flat representation.
+    // Steps: first go through the input elements/chunks for uniqueness.
+    // At the end of this processing, per the sorted sequence of unique elements:
+    // update/create data structures: y, y_indices, x_rev_indices, y_count
+    //
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 1;
+
+    // OUTPUT(s): indices..
+    // y_indices: [1, 0, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [1, 0, 0, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [2, 1, 2, 1] -- count at each y_index. sum = len(x)
+
+    // NOTE: y [1, 2, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    //
+    // Returns the tuple (y_indices, x_rev_indices, y_count).
+
+    template <class T>
+    auto sorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        // Per unique chunk: its discovery order (y_idx), the chunk index of its first
+        // occurrence in x (x_idx), and how many times it has been seen (ct).
+        struct y_info
+        {
+            size_t y_idx;
+            size_t x_idx;
+            size_t ct = 0;
+        };
+
+        // The map is keyed by flat start offset but ordered by chunk *contents*, so two
+        // offsets with equal chunks collide and iteration yields chunks in sorted order.
+        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
+        std::map<size_t, y_info, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
+
+        // rv is a named local returned by value (NRVO); the bindings alias its members.
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
+        auto& [y_indices, x_rev_indices, y_count] = rv;
+
+        // go through all the elements and find the unique elements..
+        size_t count_x = input_data.size();
+        for(size_t f_idx = 0, x_idx = 0; f_idx < count_x; f_idx += chunk_sz, x_idx++)
+        {
+            // Positional aggregate initialization (y_idx, x_idx); ct keeps its default of 0.
+            // This avoids designated initializers, which are not part of C++17.
+            y_info entry{uniq_val_map.size(), x_idx};
+            // insert() leaves an existing entry untouched, preserving the first occurrence.
+            auto itr = uniq_val_map.insert({f_idx, entry}).first;
+            itr->second.ct++;
+            // Temporarily record the discovery-order index; remapped to sorted order below.
+            x_rev_indices.push_back(itr->second.y_idx);
+        }
+
+        // y2x_indices maps discovery order -> position in sorted order.
+        std::vector<std::size_t> y2x_indices(uniq_val_map.size());
+        y_indices.resize(uniq_val_map.size());
+        y_count.resize(uniq_val_map.size());
+        size_t idx = 0;
+        // the unique elements are now sorted:
+        // post-processing for all the return indices.
+        for(const auto& v : uniq_val_map)
+        {
+            y2x_indices[v.second.y_idx] = idx;
+            y_indices[idx]              = v.second.x_idx;
+            y_count[idx]                = v.second.ct;
+            idx++;
+        }
+        // update x_rev_indices as per the sorted order of y_indices
+        for(auto& i : x_rev_indices)
+            i = y2x_indices[i];
+
+        return rv;
+    }
+
+    // CASE UNSORTED:
+    //
+    // To process into an un-sorted unique series of elements/chunks:
+    // For chunk size = 1 is a simple element, else use a flat representation of a tensor obj
+    // Go through the input elements/chunks one by one with inline processing of indices..
+
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 0;
+
+    // OUTPUT(s): indices..
+    // y_indices: [0, 1, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [0, 1, 1, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [1, 2, 2, 1] -- count at each y_index. sum = len(x)
+
+    // NOTE: y [2, 1, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    // Output data structures: y_indices, x_rev_indices, y_count are processed inline.
+    //
+    // Returns the tuple (y_indices, x_rev_indices, y_count).
+
+    template <class T>
+    auto unsorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        // Map from chunk (keyed by flat start offset, ordered by contents) to its index in y.
+        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
+        std::map<size_t, size_t, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
+
+        // rv is used for NRVO below..
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
+        auto& [y_indices, x_rev_indices, y_count] = rv;
+
+        // go through all the elements and add the unique elements into the map..
+        // inline processing for outputs: y_indices, x_rev_indices, y_count
+        size_t count_x = input_data.size();
+        for(size_t f_idx = 0; f_idx < count_x; f_idx += chunk_sz)
+        {
+            auto [itr, added_new] = uniq_val_map.insert({f_idx, y_indices.size()});
+            if(added_new)
+            {
+                y_count.push_back(0);
+                // x_rev_indices.size() is the index (in chunks) of the chunk being visited.
+                y_indices.push_back(x_rev_indices.size());
+            }
+            y_count[itr->second]++;
+            x_rev_indices.push_back(itr->second);
+        }
+
+        return rv;
+    }
+
+    // Axis. Default: none. Range: [-rank, rank-1]
+    std::optional<int64_t> axis;
+
+    // Sorted, Default: 1= sorted. 0 = unsorted.
+    bool sorted = true;
+
+    // Exposes the attributes "axis" and "sorted" to the reflection visitor f.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"), f(self.sorted, "sorted"));
+    }
+
+    std::string name() const { return "unique"; }
+
+    // Computes the output shape for a single input x: a tuple of four sub-shapes
+    // (y, then three 1-D int64 index outputs). The unique count is only known at run time,
+    // so the leading dimension is dynamic with range [1, max possible unique count].
+    // Throws if axis is set and does not resolve to 0.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+
+        const auto& sh_x   = inputs[0];
+        auto lens_x        = sh_x.lens();
+        size_t dim_x       = sh_x.ndim();
+        size_t max_uniq_ct = sh_x.elements();
+        std::vector<shape::dynamic_dimension> d_out;
+
+        if(axis)
+        {
+            int64_t t_axis = migraphx::tune_axis(dim_x, *axis, name());
+            if(t_axis != 0)
+                MIGRAPHX_THROW("Unique: Only supports axis = 0 or None");
+
+            d_out = sh_x.to_dynamic().dyn_dims();
+            // only axis = 0 is supported:
+            max_uniq_ct = lens_x[0];
+            // min = 1 unique element; max = full dimension along axis 0
+            d_out[0] = {1, max_uniq_ct};
+        }
+        else
+        {
+            d_out.push_back({1, max_uniq_ct});
+        }
+
+        shape sh_y = {sh_x.type(), d_out};
+        // The three outputted Indices are just 1-D:
+        shape sh_idx{shape::int64_type, {d_out[0]}};
+
+        return {{sh_y, sh_idx, sh_idx, sh_idx}};
+    }
+
+    // Evaluates the operator on args.front() and returns the four outputs
+    // (y, y_indices, x_rev_indices, y_count). Buffers are allocated at the worst-case size
+    // (one entry per input element) and reshaped to the actual sizes once they are known.
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
+    {
+        auto sh_x          = args.front().get_shape();
+        auto lens_x        = sh_x.lens();
+        shape output_shape = dyn_out.computed_shape;
+        auto vec_ss        = output_shape.sub_shapes();
+        auto ct_x          = sh_x.elements();
+        shape sh_y         = {vec_ss[0].type(), {ct_x}};
+        shape sh_idx       = {vec_ss[1].type(), {ct_x}};
+        shape sh_x_idx     = {vec_ss[1].type(), {ct_x}};
+
+        argument res_y{sh_y};
+        argument res_y_idx{sh_idx};
+        argument res_x_rev_idx{sh_idx};
+        argument res_y_ct_idx{sh_idx};
+
+        std::vector<size_t> out_y_idx;
+        std::vector<size_t> out_x_rev_idx;
+        std::vector<size_t> out_y_ct;
+
+        // If axis is not none, for >1D tensors, we have to consider
+        // then, the uniqueness of chunks of sub-tensors: a subsequence of built-ins..
+        // For a built-in type, chunk_sz is of course = 1
+        size_t chunk_sz = 1;
+        // Guard against a zero-length axis 0: dividing by it would be undefined behaviour,
+        // and there is nothing to process in that case anyway (ct_x is 0 as well).
+        if(axis and lens_x[0] != 0)
+            chunk_sz = ct_x / lens_x[0]; // axis = 0 is supported.
+
+        visit_all(args.front(), res_y)([&](auto x, auto y_flat) {
+            using o_type = typename decltype(x)::value_type;
+            // Flat copy of the input in iteration order, used for offset-based comparison.
+            std::vector<o_type> x_in(x.begin(), x.end());
+
+            std::tie(out_y_idx, out_x_rev_idx, out_y_ct) =
+                sorted ? sorted_uniq_indices(x_in, chunk_sz)
+                       : unsorted_uniq_indices(x_in, chunk_sz);
+
+            const auto uniq_ct = out_y_idx.size();
+
+            // construct y from x[indices] in flattened form
+            // later we reshape y to the final shape..
+            auto y_dst = y_flat.begin();
+            for(size_t idx = 0; idx < uniq_ct; idx++)
+                y_dst = std::copy_n(x_in.begin() + out_y_idx[idx] * chunk_sz, chunk_sz, y_dst);
+
+            std::vector<size_t> lens_y;
+            // if axis is specified:
+            // the output shape keeps the n-1 dimensions of x
+            if(axis)
+            {
+                lens_y    = lens_x;
+                lens_y[0] = uniq_ct;
+            }
+            else
+            {
+                lens_y = {uniq_ct};
+            }
+            sh_y   = {sh_y.type(), lens_y};
+            sh_idx = {sh_idx.type(), {uniq_ct}};
+        });
+
+        visit_all(res_y_idx, res_x_rev_idx, res_y_ct_idx)(
+            [&](auto y_indices, auto x_rev_indices, auto y_count) {
+                std::copy(out_y_idx.begin(), out_y_idx.end(), y_indices.begin());
+                std::copy(out_x_rev_idx.begin(), out_x_rev_idx.end(), x_rev_indices.begin());
+                std::copy(out_y_ct.begin(), out_y_ct.end(), y_count.begin());
+                // x_rev_indices has one entry per input element/chunk, not per unique value.
+                sh_x_idx = {sh_idx.type(), {out_x_rev_idx.size()}};
+            });
+
+        return {{res_y.reshape(sh_y),
+                 res_y_idx.reshape(sh_idx),
+                 res_x_rev_idx.reshape(sh_x_idx),
+                 res_y_ct_idx.reshape(sh_idx)}};
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -119,6 +119,8 @@
 #include <migraphx/op/scatternd_add.hpp>
 #include <migraphx/op/scatternd_none.hpp>
 #include <migraphx/op/scatternd_mul.hpp>
+#include <migraphx/op/scatternd_max.hpp>
+#include <migraphx/op/scatternd_min.hpp>
 #include <migraphx/op/sigmoid.hpp>
 #include <migraphx/op/sign.hpp>
 #include <migraphx/op/sinh.hpp>
@@ -137,6 +139,7 @@
 #include <migraphx/op/unary.hpp>
 #include <migraphx/op/unary_not.hpp>
 #include <migraphx/op/undefined.hpp>
+#include <migraphx/op/unique.hpp>
 #include <migraphx/op/unknown.hpp>
 #include <migraphx/op/unsqueeze.hpp>
 #include <migraphx/op/where.hpp>

--- a/src/include/migraphx/tune_axis.hpp
+++ b/src/include/migraphx/tune_axis.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,21 +24,21 @@
 #ifndef MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP
 #define MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP

-#include <utility>
-#include <cstdint>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/errors.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-inline int tune_axis(const int n_dim, const int axis, const std::string& op_name = "OPERATOR")
+inline int tune_axis(int n_dim, int axis, const std::string& op_name = "OPERATOR")
 {
-    if(axis >= n_dim or std::abs(axis) > n_dim)
-    {
+    if(axis < 0) // normalize: a negative axis is offset by n_dim
+        axis += n_dim;
+
+    if(axis < 0 or axis >= n_dim) // still negative here means the input was below -n_dim
         MIGRAPHX_THROW(to_upper(op_name) + ": axis is out of range.");
-    }
-    return (axis < 0) ? axis + n_dim : axis;
+
+    return axis; // now within [0, n_dim)
 }

 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/device/pad.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/pad.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,27 +21,26 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_ONNX_POOLING_HPP
+#define MIGRAPHX_GUARD_AMDMIGRAPHX_ONNX_POOLING_HPP

-#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PAD_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PAD_HPP
-
-#include <migraphx/argument.hpp>
-#include <migraphx/gpu/device/config.hpp>
-#include <hip/hip_runtime_api.h>
+#include <migraphx/config.hpp>
+#include <migraphx/onnx/onnx_parser.hpp>
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/instruction.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
+namespace onnx {
+
+value handle_pooling_values(const op_desc& opd,
+                            onnx_parser::node_info info,
+                            const shape& in_shape,
+                            value values);

-argument MIGRAPHX_DEVICE_EXPORT pad(hipStream_t stream,
-                                    argument result,
-                                    argument arg1,
-                                    float value,
-                                    std::vector<std::int64_t> pads);
+instruction_ref add_pooling_op(const op_desc& opd, onnx_parser::node_info info, instruction_ref l0);

-} // namespace device
-} // namespace gpu
+} // namespace onnx
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/onnx/onnx.proto
+++ b/src/onnx/onnx.proto
--- a/src/onnx/onnx_parser.cpp
+++ b/src/onnx/onnx_parser.cpp
@@ -34,7 +34,9 @@
 #include <migraphx/file_buffer.hpp>
 #include <migraphx/filesystem.hpp>
 #include <migraphx/op/unknown.hpp>
+#include <migraphx/float8.hpp>
 #include <migraphx/env.hpp>
+#include <onnx.pb.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -484,6 +486,8 @@ literal onnx_parser::parse_value(const onnx::AttributeProto& attr) const
    case onnx::AttributeProto::TENSORS:
    case onnx::AttributeProto::SPARSE_TENSOR:
    case onnx::AttributeProto::SPARSE_TENSORS:
+    case onnx::AttributeProto::TYPE_PROTOS:
+    case onnx::AttributeProto::TYPE_PROTO:
    case onnx::AttributeProto::GRAPHS: return {};
    }
    MIGRAPHX_THROW("PARSE_VALUE: Invalid attribute type " + std::to_string(attr.type()));
@@ -545,6 +549,18 @@ literal onnx_parser::parse_tensor(const onnx::TensorProto& t) const
    case onnx::TensorProto::DOUBLE:
        return create_literal(shape::double_type, dims, t.double_data());
    case onnx::TensorProto::FLOAT: return create_literal(shape::float_type, dims, t.float_data());
+    case onnx::TensorProto::FLOAT8E4M3FNUZ: {
+        std::vector<int32_t> data_int32(t.int32_data().begin(), t.int32_data().end());
+        std::vector<migraphx::fp8::fp8e4m3fnuz> data_fp8;
+        std::transform(data_int32.begin(),
+                       data_int32.end(),
+                       std::back_inserter(data_fp8),
+                       [](float raw_val) { return migraphx::fp8::fp8e4m3fnuz{raw_val}; });
+        return create_literal(shape::fp8e4m3fnuz_type, dims, data_fp8);
+    }
+    case onnx::TensorProto::FLOAT8E5M2FNUZ:
+    case onnx::TensorProto::FLOAT8E5M2:
+    case onnx::TensorProto::FLOAT8E4M3FN:
    case onnx::TensorProto::UNDEFINED:
    case onnx::TensorProto::STRING:
    case onnx::TensorProto::COMPLEX64:
@@ -609,6 +625,13 @@ shape::type_t get_type(int dtype)
    case 11: return shape::double_type;
    case 12: return shape::uint32_type;
    case 13: return shape::uint64_type;
+    case 18: return shape::fp8e4m3fnuz_type;
+    case 14:
+    case 15:
+    case 16:
+    case 17:
+    case 19:
+    case 20:
    default: {
        MIGRAPHX_THROW("Prototensor data type " + std::to_string(dtype) + " not supported");
    }

--- a/src/onnx/parse_multinomial.cpp
+++ b/src/onnx/parse_multinomial.cpp
@@ -127,9 +127,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
            // use literal.  The array populated by random_uniform may have any shape, as long its
            // number of elements is batch_size * sample_size .
            size_t batch_size = s0.lens().front();
-            auto rand_dummy   = info.add_literal(
-                migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});
-
+            auto rand_dummy   = info.add_literal(migraphx::literal{
+                migraphx::shape{migraphx::shape::float_type, {batch_size, sample_size}},
+                std::vector<float>(batch_size * sample_size)});
            randoms =
                info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
        }