Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into auto_contig_fix

d7dfe995 · Khalique Ahmed · c6ec6638 · e3e00547 · d7dfe995 · d7dfe995
Commit d7dfe995 authored Dec 05, 2023 by Khalique Ahmed
20 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -136,12 +136,14 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
 }, mlir_debug: rocmnode('mi100+') { cmake_build ->
    stage('MLIR Debug') {
-        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1']) {
+        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot']) {
            def sanitizers = "undefined"
            // Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
            def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
            def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr -fno-sanitize-recover=${sanitizers}"
            def gpu_targets = getgputargets()
+            // Since the purpose of this run is to verify everything MLIR supports,
+            // enable all possible types of offloads.
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags_cxx}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DGPU_TARGETS='${gpu_targets}'")
        }
    }

--- a/docs/.doxygen/Doxyfile
+++ b/docs/.doxygen/Doxyfile
@@ -28,7 +28,14 @@ MACRO_EXPANSION = YES
 OUTPUT_DIRECTORY = docBin
-PREDEFINED = DOXYGEN
+PREDEFINED = \
+    DOXYGEN \
+    MIGRAPHX_EXPORT= \
+    MIGRAPHX_API_EXPORT= \
+    MIGRAPHX_GPU_EXPORT= \
+    MIGRAPHX_CPU_EXPORT= \
+    MIGRAPHX_ONNX_EXPORT= \
+    MIGRAPHX_TF_EXPORT=
 PROJECT_NAME = MIGraphX

--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -89,7 +89,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.29.0
+rocm-docs-core==0.30.0
    # via -r requirements.in
 smmap==5.0.0
    # via gitdb

--- a/docs/dev/data.rst
+++ b/docs/dev/data.rst
@@ -5,26 +5,36 @@ shape
 -----
 .. doxygenstruct:: migraphx::internal::shape
+   :members:
+   :undoc-members:
 literal
 -------
 .. doxygenstruct:: migraphx::internal::literal
+   :members:
+   :undoc-members:
 argument
 --------
 .. doxygenstruct:: migraphx::internal::argument
+   :members:
+   :undoc-members:
 raw_data
 --------
 .. doxygenstruct:: migraphx::internal::raw_data
+   :members:
+   :undoc-members:
-.. doxygenfunction:: migraphx::internal::visit_all
+.. doxygenfunction:: template<class T, class ...Ts> auto migraphx::internal::visit_all(T &&x, Ts&&... xs)
 tensor_view
 -----------
 .. doxygenstruct:: migraphx::internal::tensor_view
+   :members:
+   :undoc-members:
--- a/docs/dev/dev_intro.rst
+++ b/docs/dev/dev_intro.rst
@@ -18,8 +18,8 @@ Directions for building MIGraphX from source can be found in the main README fil
 Adding Two Literals
 --------------------
-A program is a collection of modules, which are collections of instructions to be executed when calling `eval <migraphx::program::eval>`.
+A program is a collection of modules, which are collections of instructions to be executed when calling :cpp:any:`eval <migraphx::internal::program::eval>`.
-Each instruction has an associated `operation <migraphx::operation>` which represents the computation to be performed by the instruction.
+Each instruction has an associated :cpp:any:`operation <migraphx::internal::operation>` which represents the computation to be performed by the instruction.
 We start with a snippet of the simple ``add_two_literals()`` function::
@@ -41,14 +41,14 @@ We start with a snippet of the simple ``add_two_literals()`` function::
    auto result = p.eval({}).back();
    std::cout << "add_two_literals: 1 + 2 = " << result << "\n";
-We start by creating a simple ``migraphx::program`` object and then getting a pointer to the main module of it.
+We start by creating a simple :cpp:any:`migraphx::program <migraphx::internal::program>` object and then getting a pointer to the main module of it.
 The program is a collection of ``modules`` that start executing from the main module, so instructions are added to the modules rather than directly onto the program object.
-We then use the `add_literal <migraphx::program::add_literal>` function to add an instruction that stores the literal number ``1`` while returning an `instruction_ref <migraphx::instruction_ref>`.
+We then use the :cpp:any:`add_literal <migraphx::internal::program::add_literal>` function to add an instruction that stores the literal number ``1`` while returning an :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>`.
-The returned `instruction_ref <migraphx::instruction_ref>` can be used in another instruction as an input.
+The returned :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>` can be used in another instruction as an input.
-We use the same `add_literal <migraphx::program::add_literal>` function to add a ``2`` to the program.
+We use the same :cpp:any:`add_literal <migraphx::internal::program::add_literal>` function to add a ``2`` to the program.
 After creating the literals, we then create the instruction to add the numbers together.
-This is done by using the `add_instruction <migraphx::program::add_instruction>` function with the ``"add"`` `operation <migraphx::program::operation>` created by `make_op <migraphx::program::make_op>` along with the previous `add_literal` `instruction_ref <migraphx::instruction_ref>` for the input arguments of the instruction.
+This is done by using the :cpp:any:`add_instruction <migraphx::internal::program::add_instruction>` function with the ``"add"`` :cpp:any:`operation <migraphx::internal::operation>` created by :cpp:any:`make_op <migraphx::internal::make_op>` along with the previous ``add_literal`` :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>` for the input arguments of the instruction.
-Finally, we can run this `program <migraphx::program>` by compiling it for the reference target (CPU) and then running it with `eval <migraphx::program::eval>`
+Finally, we can run this :cpp:any:`program <migraphx::internal::program>` by compiling it for the reference target (CPU) and then running it with :cpp:any:`eval <migraphx::internal::program::eval>`
 The result is then retrieved and printed to the console.
 We can compile the program for the GPU as well, but the file will have to be moved to the ``test/gpu/`` directory and the correct target must be included::
@@ -76,8 +76,8 @@ We can modify the program to take an input parameter ``x``, as seen in the ``add
    p.compile(migraphx::ref::target{});
 This adds a parameter of type ``int32``, and compiles it for the CPU.
-To run the program, we need to pass the parameter as a ``parameter_map`` when we call `eval <migraphx::program::eval>`.
+To run the program, we need to pass the parameter as a ``parameter_map`` when we call :cpp:any:`eval <migraphx::internal::program::eval>`.
-We create the ``parameter_map`` by setting the ``x`` key to an `argument <migraphx::argument>` object with an ``int`` data type::
+We create the ``parameter_map`` by setting the ``x`` key to an :cpp:any:`argument <migraphx::internal::argument>` object with an ``int`` data type::
    // create a parameter_map object for passing a value to the "x" parameter
    std::vector<int> data = {4};
@@ -92,7 +92,7 @@ We create the ``parameter_map`` by setting the ``x`` key to an `argument <migrap
 Handling Tensor Data
 ---------------------
-In the previous examples we have only been dealing with scalars, but the `shape <migraphx::shape>` class can describe multi-dimensional tensors.
+In the previous examples we have only been dealing with scalars, but the :cpp:any:`shape <migraphx::internal::shape>` class can describe multi-dimensional tensors.
 For example, we can compute a simple convolution::
    migraphx::program p;
@@ -109,7 +109,7 @@ For example, we can compute a simple convolution::
 Here we create two parameters for both the ``input`` and ``weights``.
 In the previous examples, we created simple literals, however, most programs will take data from allocated buffers (usually on the GPU).
-In this case, we can create `argument <migraphx::argument>` objects directly from the pointers to the buffers::
+In this case, we can create :cpp:any:`argument <migraphx::internal::argument>` objects directly from the pointers to the buffers::
    // Compile the program
    p.compile(migraphx::ref::target{});
@@ -133,8 +133,8 @@ In this case, we can create `argument <migraphx::argument>` objects directly fro
    EXPECT(migraphx::verify::verify_rms_range(results_vector, sol));
-An `argument <migraphx::argument>` can handle memory buffers from either the GPU or the CPU.
+An :cpp:any:`argument <migraphx::internal::argument>` can handle memory buffers from either the GPU or the CPU.
-By default when running the `program <migraphx::program>`, buffers are allocated on the corresponding target.
+By default when running the :cpp:any:`program <migraphx::internal::program>`, buffers are allocated on the corresponding target.
 When compiling for the CPU, the buffers by default will be allocated on the CPU.
 When compiling for the GPU, the buffers by default will be allocated on the GPU.
 With the option ``offload_copy=true`` set while compiling for the GPU, the buffers will be located on the CPU.
@@ -143,7 +143,7 @@ With the option ``offload_copy=true`` set while compiling for the GPU, the buffe
 Importing From ONNX
 --------------------
-A `program <migraphx::program>` can be built directly from an onnx file using the MIGraphX ONNX parser.
+A :cpp:any:`program <migraphx::internal::program>` can be built directly from an onnx file using the MIGraphX ONNX parser.
 This makes it easier to use neural networks directly from other frameworks.
 In this case, there is a ``parse_onnx`` function::

--- a/docs/dev/env_vars.rst
+++ b/docs/dev/env_vars.rst
@@ -4,13 +4,13 @@ Environment Variables
 For parsing
 ---------------
-**MIGRAPHX_TRACE_ONNX_PARSER**
+.. envvar:: MIGRAPHX_TRACE_ONNX_PARSER
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debugging traces for the onnx parser.
 Prints: initializers (if used), ONNX node operators, added MIGraphX instructions
-**MIGRAPHX_DISABLE_FP16_INSTANCENORM_CONVERT**
+.. envvar:: MIGRAPHX_DISABLE_FP16_INSTANCENORM_CONVERT
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disables the conversion from fp16 to fp32 for the InstanceNormalization ONNX operator that MIGX does as a workaround for accuracy issues with reduce_mean/variance.
@@ -20,16 +20,16 @@ See ``parse_instancenorm.cpp`` for more details.
 Matchers
 ------------
-**MIGRAPHX_TRACE_MATCHES**
+.. envvar:: MIGRAPHX_TRACE_MATCHES
 Set to "1" to print the matcher that matches an instruction and the matched instruction.
 Set to "2" and use the ``MIGRAPHX_TRACE_MATCHES_FOR`` flag to filter out results.
-**MIGRAPHX_TRACE_MATCHES_FOR**
+.. envvar:: MIGRAPHX_TRACE_MATCHES_FOR
 Set to the name of any matcher and only traces for that matcher will be printed out.
-**MIGRAPHX_VALIDATE_MATCHES**
+.. envvar:: MIGRAPHX_VALIDATE_MATCHES
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Validate the module after finding the matches (runs ``module.validate()``).
@@ -37,7 +37,7 @@ Validate the module after finding the matches (runs ``module.validate()``).
 Program Execution 
 ---------------------
-**MIGRAPHX_TRACE_EVAL**
+.. envvar:: MIGRAPHX_TRACE_EVAL
 Set to "1", "2", or "3" to use.
 "1" prints the instruction run and the time taken.
@@ -48,7 +48,7 @@ Set to "1", "2", or "3" to use.
 Program Verification
 ------------------------
-**MIGRAPHX_VERIFY_ENABLE_ALLCLOSE**
+.. envvar:: MIGRAPHX_VERIFY_ENABLE_ALLCLOSE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Uses ``allclose`` with the given ``atol`` and ``rtol`` for verifying ranges with ``driver verify`` or the tests that use ``migraphx/verify.hpp``.
@@ -57,76 +57,76 @@ Uses ``allclose`` with the given ``atol`` and ``rtol`` for verifying ranges with
 Pass debugging or Pass controls
 -----------------------------------
-**MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS**
+.. envvar:: MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Debug print the instructions that have input ``contiguous`` instructions removed.
-**MIGRAPHX_DISABLE_POINTWISE_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_POINTWISE_FUSION
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disables the ``fuse_pointwise`` compile pass.
-**MIGRAPHX_DEBUG_MEMORY_COLORING**
+.. envvar:: MIGRAPHX_DEBUG_MEMORY_COLORING
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debug statements for the ``memory_coloring`` pass.
-**MIGRAPHX_TRACE_SCHEDULE**
+.. envvar:: MIGRAPHX_TRACE_SCHEDULE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debug statements for the ``schedule`` pass.
-**MIGRAPHX_TRACE_PROPAGATE_CONSTANT**
+.. envvar:: MIGRAPHX_TRACE_PROPAGATE_CONSTANT
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Traces instructions replaced with a constant.
-**MIGRAPHX_INT8_QUANTIZATION_PARAMS**
+.. envvar:: MIGRAPHX_INT8_QUANTIZATION_PARAMS
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the quantization parameters in only the main module.
-**MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND**
+.. envvar:: MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the DNNL post ops workaround.
-**MIGRAPHX_DISABLE_MIOPEN_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_MIOPEN_FUSION
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable MIOpen fusions.
-**MIGRAPHX_DISABLE_SCHEDULE_PASS**
+.. envvar:: MIGRAPHX_DISABLE_SCHEDULE_PASS
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the ``schedule`` pass.
-**MIGRAPHX_DISABLE_REDUCE_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_REDUCE_FUSION
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the ``fuse_reduce`` pass.
-**MIGRAPHX_ENABLE_NHWC**
+.. envvar:: MIGRAPHX_ENABLE_NHWC
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable the ``layout_nhwc`` pass.
-**MIGRAPHX_ENABLE_CK**
+.. envvar:: MIGRAPHX_ENABLE_CK
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable using the Composable Kernels library.
 Should be used in conjunction with ``MIGRAPHX_DISABLE_MLIR=1``.
-**MIGRAPHX_DISABLE_MLIR** 
+.. envvar:: MIGRAPHX_DISABLE_MLIR
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable using the rocMLIR library.
-**MIGRAPHX_ENABLE_EXTRA_MLIR**
+.. envvar:: MIGRAPHX_ENABLE_EXTRA_MLIR
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enables additional opportunities to use MLIR that may improve performance.
-**MIGRAPHX_COPY_LITERALS**
+.. envvar:: MIGRAPHX_COPY_LITERALS
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use ``hip_copy_to_gpu`` with a new ``literal`` instruction rather than use ``hip_copy_literal{}``.
@@ -134,22 +134,22 @@ Use ``hip_copy_to_gpu`` with a new ``literal`` instruction rather than use ``hip
 Compilation traces
 ----------------------
-**MIGRAPHX_TRACE_FINALIZE**
+.. envvar:: MIGRAPHX_TRACE_FINALIZE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Debug print instructions during the ``module.finalize()`` step.
-**MIGRAPHX_TRACE_COMPILE**
+.. envvar:: MIGRAPHX_TRACE_COMPILE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print trace information for the graph compilation process.
-**MIGRAPHX_TRACE_PASSES**
+.. envvar:: MIGRAPHX_TRACE_PASSES
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the compile pass and the program after the pass.
-**MIGRAPHX_TIME_PASSES**
+.. envvar:: MIGRAPHX_TIME_PASSES
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Time the compile passes.
@@ -158,77 +158,77 @@ Time the compile passes.
 GPU Kernels JIT compilation debugging (applicable for both hiprtc and hipclang)
 -------------------------------------------------------------------------------
-**MIGRAPHX_TRACE_CMD_EXECUTE**
+.. envvar:: MIGRAPHX_TRACE_CMD_EXECUTE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print commands executed by the MIGraphX ``process``.
-**MIGRAPHX_TRACE_HIPRTC**
+.. envvar:: MIGRAPHX_TRACE_HIPRTC
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print HIPRTC options and C++ file executed.
-**MIGRAPHX_DEBUG_SAVE_TEMP_DIR**
+.. envvar:: MIGRAPHX_DEBUG_SAVE_TEMP_DIR
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Make it so the created temporary directories are not deleted.
-**MIGRAPHX_GPU_DEBUG**
+.. envvar:: MIGRAPHX_GPU_DEBUG
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Internally, this adds the option ``-DMIGRAPHX_DEBUG`` when compiling GPU kernels. It enables assertions and capture of source locations for the errors. 
-**MIGRAPHX_GPU_DEBUG_SYM**
+.. envvar:: MIGRAPHX_GPU_DEBUG_SYM
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Adds the option ``-g`` when compiling HIPRTC.
-**MIGRAPHX_GPU_DUMP_SRC**
+.. envvar:: MIGRAPHX_GPU_DUMP_SRC
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dump the HIPRTC source files compiled.
-**MIGRAPHX_GPU_DUMP_ASM**
+.. envvar:: MIGRAPHX_GPU_DUMP_ASM
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dump the hip-clang assembly.
-**MIGRAPHX_GPU_OPTIMIZE**
+.. envvar:: MIGRAPHX_GPU_OPTIMIZE
 Set the optimization mode for GPU compile (``-O`` option).
 Defaults to ``-O3``.
-**MIGRAPHX_GPU_COMPILE_PARALLEL**
+.. envvar:: MIGRAPHX_GPU_COMPILE_PARALLEL
 Set to the number of threads to use.
 Compile GPU code in parallel with the given number of threads.
-**MIGRAPHX_TRACE_NARY**
+.. envvar:: MIGRAPHX_TRACE_NARY
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the ``nary`` device functions used.
-**MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS**
+.. envvar:: MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable HIPRTC workarounds for bugs in HIPRTC.
-**MIGRAPHX_USE_FAST_SOFTMAX**
+.. envvar:: MIGRAPHX_USE_FAST_SOFTMAX
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use the fast softmax optimization.
-**MIGRAPHX_ENABLE_NULL_STREAM**
+.. envvar:: MIGRAPHX_ENABLE_NULL_STREAM
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Allow using null stream for miopen and hipStream.
-**MIGRAPHX_NSTREAMS**
+.. envvar:: MIGRAPHX_NSTREAMS
 Set to the number of streams to use.
 Defaults to 1.
-**MIGRAPHX_TRACE_BENCHMARKING**
+.. envvar:: MIGRAPHX_TRACE_BENCHMARKING
 Set to "1" to print benchmarking trace.
 Set to "2" to print benchmarking trace with more detail.
@@ -236,45 +236,49 @@ Set to "2" to print benchmarching trace with more detail.
 MLIR vars
 -------------
-**MIGRAPHX_TRACE_MLIR**
+.. envvar:: MIGRAPHX_TRACE_MLIR
 Set to "1" to trace MLIR and print any failures.
 Set to "2" to additionally print all MLIR operations.
-**MIGRAPHX_MLIR_USE_SPECIFIC_OPS**
+.. envvar:: MIGRAPHX_MLIR_USE_SPECIFIC_OPS
 Set to the name of the operations you want to always use MLIR regardless of GPU architecture.
 Accepts a list of operators separated by commas (ex: "fused,convolution,dot").
-**MIGRAPHX_MLIR_TUNING_DB**
+.. envvar:: MIGRAPHX_MLIR_TUNING_DB
 Set to the path of the MLIR tuning database to load.
-**MIGRAPHX_MLIR_TUNING_CFG**
+.. envvar:: MIGRAPHX_MLIR_TUNING_CFG
 Set to the path of the tuning configuration.
 Appends to tuning cfg file that could be used with rocMLIR tuning scripts.
-**MIGRAPHX_MLIR_TUNE_EXHAUSTIVE**
+.. envvar:: MIGRAPHX_MLIR_TUNE_EXHAUSTIVE
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Do exhaustive tuning for MLIR.
+.. envvar:: MIGRAPHX_MLIR_TUNE_LIMIT
+Set to an integer greater than 1.
+Limits the number of solutions that MLIR will use for tuning.
 CK vars
 -----------
-**MIGRAPHX_LOG_CK_GEMM**
+.. envvar:: MIGRAPHX_LOG_CK_GEMM
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print Composable Kernels GEMM traces.
-**MIGRAPHX_CK_DEBUG**
+.. envvar:: MIGRAPHX_CK_DEBUG
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Always add the ``-DMIGRAPHX_CK_CHECK=1`` for compiling Composable Kernels operators.
-**MIGRAPHX_TUNE_CK**
+.. envvar:: MIGRAPHX_TUNE_CK
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use tuning for Composable Kernels.
@@ -282,19 +286,19 @@ Use tuning for Composable Kernels.
 Testing 
 ------------
-**MIGRAPHX_TRACE_TEST_COMPILE**
+.. envvar:: MIGRAPHX_TRACE_TEST_COMPILE
 Set to the target that you want to trace the compilation of (ex. "gpu", "cpu").
 Prints the compile trace for the given target for the verify tests.
 This flag shouldn't be used in conjunction with ``MIGRAPHX_TRACE_COMPILE``.
 For the verify tests only use ``MIGRAPHX_TRACE_TEST_COMPILE``.
-**MIGRAPHX_TRACE_TEST**
+.. envvar:: MIGRAPHX_TRACE_TEST
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Prints the reference and target programs even if the verify passed successfully.
-**MIGRAPHX_DUMP_TEST**
+.. envvar:: MIGRAPHX_DUMP_TEST
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dumps verify tests to ``.mxr`` files.
--- a/docs/dev/operators.rst
+++ b/docs/dev/operators.rst
@@ -5,6 +5,8 @@ operation
 ---------
 .. doxygenstruct:: migraphx::internal::operation
+   :members:
+   :undoc-members:
 .. doxygenfunction:: migraphx::internal::is_context_free
@@ -14,3 +16,5 @@ operators
 ---------
 .. doxygennamespace:: migraphx::internal::op
+   :members:
+   :undoc-members:
--- a/docs/dev/pass.rst
+++ b/docs/dev/pass.rst
@@ -5,63 +5,82 @@ pass
 ----
 .. doxygenstruct:: migraphx::internal::pass
+   :members:
+   :undoc-members:
 dead_code_elimination
 ---------------------
 .. doxygenstruct:: migraphx::internal::dead_code_elimination
+   :members:
+   :undoc-members:
 eliminate_common_subexpression
 ------------------------------
 .. doxygenstruct:: migraphx::internal::eliminate_common_subexpression
+   :members:
+   :undoc-members:
 eliminate_concat
 ----------------
 .. doxygenstruct:: migraphx::internal::eliminate_concat
+   :members:
+   :undoc-members:
 eliminate_contiguous
 --------------------
 .. doxygenstruct:: migraphx::internal::eliminate_contiguous
+   :members:
+   :undoc-members:
 eliminate_identity
 ------------------
 .. doxygenstruct:: migraphx::internal::eliminate_identity
+   :members:
+   :undoc-members:
 eliminate_pad
 -------------
 .. doxygenstruct:: migraphx::internal::eliminate_pad
+   :members:
+   :undoc-members:
 propagate_constant
 ------------------
 .. doxygenstruct:: migraphx::internal::propagate_constant
+   :members:
-rewrite_batchnorm
+   :undoc-members:
-----------------
-.. doxygenstruct:: migraphx::internal::rewrite_batchnorm
 rewrite_rnn
 -----------
 .. doxygenstruct:: migraphx::internal::rewrite_rnn
+   :members:
+   :undoc-members:
 schedule
 --------
 .. doxygenstruct:: migraphx::internal::schedule
+   :members:
+   :undoc-members:
 simplify_algebra
 ----------------
 .. doxygenstruct:: migraphx::internal::simplify_algebra
+   :members:
+   :undoc-members:
 simplify_reshapes
 -----------------
 .. doxygenstruct:: migraphx::internal::simplify_reshapes
+   :members:
+   :undoc-members:
--- a/docs/dev/program.rst
+++ b/docs/dev/program.rst
@@ -5,6 +5,8 @@ instruction
 -----------
 .. doxygenstruct:: migraphx::internal::instruction
+   :members:
+   :undoc-members:
 instruction_ref
 ---------------
@@ -17,6 +19,8 @@ program
 -------
 .. doxygenstruct:: migraphx::internal::program
+   :members:
+   :undoc-members:
 parse_onnx
 ----------

--- a/docs/dev/targets.rst
+++ b/docs/dev/targets.rst
@@ -5,14 +5,20 @@ target
 ------
 .. doxygenstruct:: migraphx::internal::target
+   :members:
+   :undoc-members:
 gpu::target
 -----------
 .. doxygenstruct:: migraphx::internal::gpu::target
+   :members:
+   :undoc-members:
 cpu::target
 -----------
 .. doxygenstruct:: migraphx::internal::cpu::target
+   :members:
+   :undoc-members:
--- a/docs/reference/cpp.rst
+++ b/docs/reference/cpp.rst
@@ -8,45 +8,65 @@ shape
 .. doxygenenum:: migraphx_shape_datatype_t
 .. doxygenstruct:: migraphx::shape
+   :members:
+   :undoc-members:
 argument
 --------
 .. doxygenstruct:: migraphx::argument
+   :members:
+   :undoc-members:
 target
 ------
 .. doxygenstruct:: migraphx::target
+   :members:
+   :undoc-members:
 program
 -------
 .. doxygenstruct:: migraphx::program_parameter_shapes
+   :members:
+   :undoc-members:
 .. doxygenstruct:: migraphx::program_parameters
+   :members:
+   :undoc-members:
 .. doxygenstruct:: migraphx_compile_options
+   :members:
+   :undoc-members:
 .. doxygenstruct:: migraphx::program
+   :members:
+   :undoc-members:
 quantize
 --------
 .. doxygenstruct:: migraphx::quantize_op_names
+   :members:
+   :undoc-members:
 .. doxygenfunction:: migraphx::quantize_fp16(const program&)
 .. doxygenfunction:: migraphx::quantize_fp16(const program&, const quantize_op_names&)
 .. doxygenstruct:: migraphx::quantize_int8_options
+   :members:
+   :undoc-members:
-.. doxygenfunction:: migraphx::quantize_int8
+.. doxygenfunction:: migraphx::quantize_int8
 parse_onnx
 ----------
 .. doxygenstruct:: migraphx::onnx_options
+   :members:
+   :undoc-members:
 .. doxygenfunction:: migraphx::parse_onnx(const char *)
@@ -63,16 +83,18 @@ parse_onnx
 load
 ----
-.. doxygenstruct:: migraphx_file_options
+.. doxygenstruct:: migraphx::file_options
+   :members:
+   :undoc-members:
 .. doxygenfunction:: migraphx::load(const char *)
-.. doxygenfunction:: migraphx::load(const char *, migraphx_file_options)
+.. doxygenfunction:: migraphx::load(const char *, const file_options&)
 save
 ----
 .. doxygenfunction:: migraphx::save(const program&, const char *)
-.. doxygenfunction:: migraphx::save(const program&, const char *, migraphx_file_options)
+.. doxygenfunction:: migraphx::save(const program&, const char *, const file_options&)
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,4 @@ pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@13f6c2a69cfe80a575c6b241ec7353d1e953cb12 -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/rocMLIR@a6880f1e6daec99876cd6a4820fbc69c57216401 -DBUILD_FAT_LIBROCKCOMPILER=On
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -241,6 +241,7 @@ register_migraphx_ops(
    transpose
    unary_not
    undefined
+    unique
    unknown
    unsqueeze
    where
@@ -290,6 +291,7 @@ find_package(TBB QUIET)
 if(TBB_FOUND)
    check_execution_par(TBB_HAS_EXECUTION_PAR TBB::tbb)
    if(TBB_HAS_EXECUTION_PAR)
+        list(APPEND PACKAGE_DEPENDS PACKAGE TBB)
        target_link_libraries(migraphx PUBLIC TBB::tbb)
        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
        message(STATUS "Using TBB for parallel execution")

--- a/src/include/migraphx/bit_cast.hpp
+++ b/src/include/migraphx/bit_cast.hpp
@@ -21,10 +21,13 @@
 * ************************************************************************ */
 #ifndef MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
 #define MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
+#include <type_traits>
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif
+#include <migraphx/requires.hpp>
 #include <migraphx/config.hpp>
 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
@@ -32,7 +35,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-template <typename To, typename From>
+template <typename To,
+          typename From,
+          MIGRAPHX_REQUIRES(std::is_trivially_copyable<To>{} and
+                            std::is_trivially_copyable<From>{})>
 inline constexpr To bit_cast(From fr) noexcept
 {
    static_assert(sizeof(To) == sizeof(From));

--- a/src/include/migraphx/op/unique.hpp
+++ b/src/include/migraphx/op/unique.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+#define MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/tune_axis.hpp>
+#include <utility>
+#include <map>
+#include <limits>
+#include <optional>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// https://onnx.ai/onnx/operators/onnx__Unique.html
+// The Onnx spec refers to numpy specification, used as a reference:
+// https://numpy.org/doc/stable/reference/generated/numpy.unique.html
+// Input : Given an array of elements : X.
+// Output(s) :
+// 1. Find the unique elements (Y) of input (X).
+//
+// There are three outputs in addition to the unique elements in Y:
+// 2. the indices of the input array that give the unique values
+// 3. the indices of the unique array that reconstruct the input array
+// 4. the number of times each unique value comes up in the input array
+// Optional Attribute: 'Sorted' = 1 for sorted; = 0 for unsorted.
+// The ONNX specification defaults 'sorted' to 1, while Numpy always sorts.
+//
+// Optional Attribute: 'Axis' is 'None' (default) or a valid int < rank(X).
+// Negative values are allowed.
+//
+// Numpy has the following important note on Axis:
+// ------------------------------------------------------------------
+// When an axis is specified the subarrays indexed by the axis are
+// sorted. This is done by making the specified axis the first
+// dimension of the array (move the axis to the first dimension to
+// keep the order of the other axes) and then flattening the subarrays
+// in C order. The flattened subarrays are then viewed as a structured
+// type with each element given a label, with the effect that we end
+// up with a 1-D array of structured types that can be treated in the
+// same way as any other 1-D array. The result is that the flattened
+// subarrays are sorted in lexicographic order starting with the first
+// element.
+// ------------------------------------------------------------------
+struct unique
+{
+    template <class T>
+    auto make_idx_less_fn(const T& data, size_t chunk_sz) const
+    {
+        return [&data, chunk_sz](auto idx1, auto idx2) { // holds a reference to data
+            return std::lexicographical_compare(data.begin() + idx1,
+                                                data.begin() + idx1 + chunk_sz,
+                                                data.begin() + idx2,
+                                                data.begin() + idx2 + chunk_sz);
+        };
+    }
+    // CASE SORTED:
+    //
+    // Builds the index outputs for a sorted series of unique elements/chunks.
+    // chunk_sz == 1 means single elements; >1 means flattened chunks of chunk_sz values.
+    // Pass 1: insert each chunk's start offset into a map ordered by chunk contents.
+    // Pass 2: walk the map in sorted order to fill y_indices and y_count, then
+    // remap x_rev_indices from first-seen order to sorted order.
+    //
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 1;
+    // OUTPUT(s): indices..
+    // y_indices: [1, 0, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [1, 0, 0, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [2, 1, 2, 1] -- count at each y_index. sum = len(x)
+    // NOTE: y [1, 2, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    template <class T>
+    auto sorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        struct y_info
+        {
+            size_t y_idx; // rank of this unique chunk in first-seen order
+            size_t x_idx; // chunk number in x of the first occurrence
+            size_t ct = 0; // number of occurrences seen so far
+        };
+        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
+        std::map<size_t, y_info, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
+        auto& [y_indices, x_rev_indices, y_count] = rv;
+        // f_idx = flat start offset of a chunk, x_idx = chunk number; insert keeps the first seen
+        size_t count_x = input_data.size();
+        for(size_t f_idx = 0, x_idx = 0; f_idx < count_x; f_idx += chunk_sz, x_idx++)
+        {
+            y_info entry          = {.y_idx = uniq_val_map.size(), .x_idx = x_idx};
+            auto [itr, added_new] = uniq_val_map.insert({f_idx, entry});
+            itr->second.ct++;
+            x_rev_indices.push_back(itr->second.y_idx);
+        }
+        std::vector<std::size_t> y2x_indices(uniq_val_map.size());
+        y_indices.resize(uniq_val_map.size());
+        y_count.resize(uniq_val_map.size());
+        size_t idx = 0;
+        // iterating the map visits the unique chunks in sorted order:
+        // idx is the sorted position; y2x_indices maps first-seen rank -> sorted position.
+        for(const auto& v : uniq_val_map)
+        {
+            y2x_indices[v.second.y_idx] = idx;
+            y_indices[idx]              = v.second.x_idx;
+            y_count[idx]                = v.second.ct;
+            idx++;
+        }
+        // remap x_rev_indices from first-seen rank to sorted position
+        for(auto& i : x_rev_indices)
+            i = y2x_indices[i];
+        return rv;
+    }
+    // CASE UNSORTED:
+    //
+    // Builds the index outputs for an unsorted series of unique elements/chunks:
+    // chunk_sz == 1 means single elements; >1 means flattened chunks of chunk_sz values.
+    // The input chunks are visited once, in order, and the outputs are filled as we go..
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 0;
+    // OUTPUT(s): indices..
+    // y_indices: [0, 1, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [0, 1, 1, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [1, 2, 2, 1] -- count at each y_index. sum = len(x)
+    // NOTE: y [2, 1, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    // Output data structures: y_indices, x_rev_indices, y_count are processed inline.
+    template <class T>
+    auto unsorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
+        std::map<size_t, size_t, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
+        // rv is a single named return object (intended for NRVO)..
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
+        auto& [y_indices, x_rev_indices, y_count] = rv;
+        // the map value is the chunk's position in y, i.e. its rank in first-seen order..
+        // inline processing for outputs: y_indices, x_rev_indices, y_count
+        size_t count_x = input_data.size();
+        for(size_t f_idx = 0; f_idx < count_x; f_idx += chunk_sz)
+        {
+            auto [itr, added_new] = uniq_val_map.insert({f_idx, y_indices.size()});
+            if(added_new)
+            {
+                y_count.push_back(0);
+                y_indices.push_back(x_rev_indices.size()); // = current chunk number in x
+            }
+            y_count[itr->second]++;
+            x_rev_indices.push_back(itr->second);
+        }
+        return rv;
+    }
+    // Axis. Default: none. Range: [-rank, rank-1]
+    std::optional<int64_t> axis;
+    // Sorted, Default: 1= sorted. 0 = unsorted.
+    bool sorted = true;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"), f(self.sorted, "sorted"));
+    }
+    std::string name() const { return "unique"; }
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+        auto& sh_x         = inputs[0];
+        auto lens_x        = sh_x.lens();
+        size_t dim_x       = sh_x.ndim();
+        size_t max_uniq_ct = sh_x.elements();
+        std::vector<shape::dynamic_dimension> d_out;
+        if(axis)
+        {
+            int64_t t_axis = migraphx::tune_axis(dim_x, *axis, name());
+            if(t_axis != 0)
+                MIGRAPHX_THROW("Unique: Only supports axis = 0 or None");
+            d_out = sh_x.to_dynamic().dyn_dims();
+            // only axis = 0 is supported:
+            max_uniq_ct = lens_x[0];
+            // min = 1 unique element; max = full dimension along axis 0
+            d_out[0] = {1, max_uniq_ct};
+        }
+        else
+        {
+            d_out.push_back({1, max_uniq_ct});
+        }
+        shape sh_y = {sh_x.type(), d_out};
+        // The three index outputs are 1-D and share the dynamic dimension d_out[0]:
+        shape sh_idx{shape::int64_type, {d_out[0]}};
+        return {{sh_y, sh_idx, sh_idx, sh_idx}};
+    }
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
+    {
+        auto sh_x          = args.front().get_shape();
+        auto lens_x        = sh_x.lens();
+        shape output_shape = dyn_out.computed_shape;
+        auto vec_ss        = output_shape.sub_shapes();
+        auto ct_x          = sh_x.elements();
+        shape sh_y         = {vec_ss[0].type(), {ct_x}}; // provisional; narrowed below
+        shape sh_idx       = {vec_ss[1].type(), {ct_x}}; // provisional; narrowed below
+        shape sh_x_idx     = {vec_ss[1].type(), {ct_x}}; // provisional; narrowed below
+        argument res_y{sh_y};
+        argument res_y_idx{sh_idx};
+        argument res_x_rev_idx{sh_idx};
+        argument res_y_ct_idx{sh_idx};
+        std::vector<size_t> out_y_idx;
+        std::vector<size_t> out_x_rev_idx;
+        std::vector<size_t> out_y_ct;
+        // When an axis is given, uniqueness is evaluated on whole sub-tensors
+        // (chunks of chunk_sz consecutive elements) rather than on single elements.
+        // Without an axis every element is its own chunk, so chunk_sz = 1.
+        size_t chunk_sz = 1;
+        if(axis)
+            chunk_sz = ct_x / lens_x[0]; // axis = 0 is supported.
+        visit_all(args.front(), res_y)([&](auto x, auto y_flat) {
+            using o_type = typename decltype(x)::value_type;
+            std::vector<o_type> x_in(x.begin(), x.end());
+            std::tie(out_y_idx, out_x_rev_idx, out_y_ct) =
+                sorted ? sorted_uniq_indices(x_in, chunk_sz)
+                       : unsorted_uniq_indices(x_in, chunk_sz);
+            const auto uniq_ct = out_y_idx.size();
+            // construct y from x[indices] in flattened form:
+            // out_y_idx holds chunk numbers, so scale by chunk_sz; y is reshaped afterwards..
+            auto y_dst = y_flat.begin();
+            for(size_t idx = 0; idx < uniq_ct; idx++)
+                y_dst = copy_n(x_in.begin() + out_y_idx[idx] * chunk_sz, chunk_sz, y_dst);
+            std::vector<size_t> lens_y;
+            // if axis is specified:
+            // y keeps the lens of x except lens[0], which becomes the unique count
+            if(axis)
+            {
+                lens_y    = lens_x;
+                lens_y[0] = uniq_ct;
+            }
+            else
+            {
+                lens_y = {uniq_ct};
+            }
+            sh_y   = {sh_y.type(), lens_y};
+            sh_idx = {sh_idx.type(), {uniq_ct}};
+        });
+        visit_all(res_y_idx, res_x_rev_idx, res_y_ct_idx)(
+            [&](auto y_indices, auto x_rev_indices, auto y_count) {
+                std::copy(out_y_idx.begin(), out_y_idx.end(), y_indices.begin());
+                std::copy(out_x_rev_idx.begin(), out_x_rev_idx.end(), x_rev_indices.begin());
+                std::copy(out_y_ct.begin(), out_y_ct.end(), y_count.begin());
+                sh_x_idx = {sh_idx.type(), {out_x_rev_idx.size()}};
+            });
+        return {{res_y.reshape(sh_y),
+                 res_y_idx.reshape(sh_idx),
+                 res_x_rev_idx.reshape(sh_x_idx),
+                 res_y_ct_idx.reshape(sh_idx)}};
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -139,6 +139,7 @@
 #include <migraphx/op/unary.hpp>
 #include <migraphx/op/unary_not.hpp>
 #include <migraphx/op/undefined.hpp>
+#include <migraphx/op/unique.hpp>
 #include <migraphx/op/unknown.hpp>
 #include <migraphx/op/unsqueeze.hpp>
 #include <migraphx/op/where.hpp>

--- a/src/include/migraphx/tune_axis.hpp
+++ b/src/include/migraphx/tune_axis.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,21 +24,21 @@
 #ifndef MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP
 #define MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP
-#include <utility>
-#include <cstdint>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/errors.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-inline int tune_axis(const int n_dim, const int axis, const std::string& op_name = "OPERATOR")
+inline int tune_axis(int n_dim, int axis, const std::string& op_name = "OPERATOR")
 {
-    if(axis >= n_dim or std::abs(axis) > n_dim)
+    if(axis < 0)
-    {
+        axis += n_dim;
+    if(axis < 0 or axis >= n_dim)
        MIGRAPHX_THROW(to_upper(op_name) + ": axis is out of range.");
-    }
-    return (axis < 0) ? axis + n_dim : axis;
+    return axis;
 }
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/onnx/onnx.proto
+++ b/src/onnx/onnx.proto
--- a/src/onnx/onnx_parser.cpp
+++ b/src/onnx/onnx_parser.cpp
@@ -34,7 +34,9 @@
 #include <migraphx/file_buffer.hpp>
 #include <migraphx/filesystem.hpp>
 #include <migraphx/op/unknown.hpp>
+#include <migraphx/float8.hpp>
 #include <migraphx/env.hpp>
+#include <onnx.pb.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -484,6 +486,8 @@ literal onnx_parser::parse_value(const onnx::AttributeProto& attr) const
    case onnx::AttributeProto::TENSORS:
    case onnx::AttributeProto::SPARSE_TENSOR:
    case onnx::AttributeProto::SPARSE_TENSORS:
+    case onnx::AttributeProto::TYPE_PROTOS:
+    case onnx::AttributeProto::TYPE_PROTO:
    case onnx::AttributeProto::GRAPHS: return {};
    }
    MIGRAPHX_THROW("PARSE_VALUE: Invalid attribute type " + std::to_string(attr.type()));
@@ -545,6 +549,18 @@ literal onnx_parser::parse_tensor(const onnx::TensorProto& t) const
    case onnx::TensorProto::DOUBLE:
        return create_literal(shape::double_type, dims, t.double_data());
    case onnx::TensorProto::FLOAT: return create_literal(shape::float_type, dims, t.float_data());
+    case onnx::TensorProto::FLOAT8E4M3FNUZ: {
+        std::vector<int32_t> data_int32(t.int32_data().begin(), t.int32_data().end());
+        std::vector<migraphx::fp8::fp8e4m3fnuz> data_fp8;
+        std::transform(data_int32.begin(),
+                       data_int32.end(),
+                       std::back_inserter(data_fp8),
+                       [](float raw_val) { return migraphx::fp8::fp8e4m3fnuz{raw_val}; });
+        return create_literal(shape::fp8e4m3fnuz_type, dims, data_fp8);
+    }
+    case onnx::TensorProto::FLOAT8E5M2FNUZ:
+    case onnx::TensorProto::FLOAT8E5M2:
+    case onnx::TensorProto::FLOAT8E4M3FN:
    case onnx::TensorProto::UNDEFINED:
    case onnx::TensorProto::STRING:
    case onnx::TensorProto::COMPLEX64:
@@ -609,6 +625,13 @@ shape::type_t get_type(int dtype)
    case 11: return shape::double_type;
    case 12: return shape::uint32_type;
    case 13: return shape::uint64_type;
+    case 18: return shape::fp8e4m3fnuz_type;
+    case 14:
+    case 15:
+    case 16:
+    case 17:
+    case 19:
+    case 20:
    default: {
        MIGRAPHX_THROW("Prototensor data type " + std::to_string(dtype) + " not supported");
    }

--- a/src/onnx/parse_multinomial.cpp
+++ b/src/onnx/parse_multinomial.cpp
@@ -127,9 +127,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
            // use literal.  The array populated by random_uniform may have any shape, as long its
            // number of elements is batch_size * sample_size .
            size_t batch_size = s0.lens().front();
-            auto rand_dummy   = info.add_literal(
+            auto rand_dummy   = info.add_literal(migraphx::literal{
-                migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});
+                migraphx::shape{migraphx::shape::float_type, {batch_size, sample_size}},
+                std::vector<float>(batch_size * sample_size)});
            randoms =
                info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
        }