Merge remote-tracking branch 'origin/blas_tuning' into fp8_rocblas

f8761f3f · Umang Yadav · 6fb61ded · 1ea4a08a · f8761f3f · f8761f3f
Commit f8761f3f authored Nov 06, 2023 by Umang Yadav
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -80,6 +80,10 @@ ADD rbuild.ini /rbuild.ini
 # Temporarily install a new cmake until switching to ubuntu 22.04
 RUN pip3 install cmake==3.22.1

+# Location where ONNX unit test models are cached
+ENV ONNX_HOME=/.onnx
+RUN mkdir -p $ONNX_HOME/models && chmod 777 $ONNX_HOME/models
+
 COPY ./tools/install_prereqs.sh /
 RUN /install_prereqs.sh /usr/local / && rm /install_prereqs.sh
 RUN test -f /usr/local/hash || exit 1
@@ -91,11 +95,6 @@ RUN pip3 install yapf==0.28.0
 ADD docs/.sphinx/requirements.txt /doc-requirements.txt
 RUN pip3 install -r /doc-requirements.txt

-# Download real models to run onnx unit tests
-ENV ONNX_HOME=/.onnx
-COPY ./tools/download_models.sh /
-RUN /download_models.sh && rm /download_models.sh
-
 # Install latest ccache version
 RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
 RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF

--- a/src/include/migraphx/op/multinomial.hpp
+++ b/src/include/migraphx/op/multinomial.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,11 +21,52 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
+/**
+ * Multinomial or categorical distribution.  Performs a sampling of random input
+ *         and returns the index of the category, or bucket, selected by each sample.
+ *         This does not require the standard multinomial
+ *         distribution but instead takes a probability distribution, i.e. cumulative
+ *         distribution function (CDF) as its first input.
+ *
+ *      Inputs:   args[0] - a tensor of probabilities for each category.  Values are
+ *                          cumulative distribution function (CDF)
+ *                          totals as provided by operation prefix_scan_sum.  Values are
+ *                          cumulative probabilities (i.e. start with any set of numbers > 0
+ *                          and then apply prefix_scan_sum).  Values do not need to be
+ *                          normalized to sum to 1; this is done in runtime computation.
+ *
+ *                          This input has Rank 2.  Dimension 0 is batch #, so that there can be
+ *                          a different CDF for each iteration in the batch.  The size of dimension
+ *                          1 is the number of categories.
+ *
+ *                args[1] - a tensor of random numbers.  The last dimension is the sample
+ *                          size, i.e. the number of
+ *                          random samples in each iteration of the batch.  Nominally
+ *                          has two dimensions where the first dimension is batch size, but
+ *                          any reshaping such that the total
+ *                          number of elements is (batch_size * sample_size) is legal.
+ *
+ *                          Values as created by a std::mt19937 like this:
+ *
+ *                           size_t sample_size = 100000;
+ *                           float seed         = 0.0f;
+ *                           std::mt19937 gen(seed);
+ *                           std::uniform_real_distribution<> dis(0.0, 1.0);
+ *                           std::vector<float> rand_samples(sample_size);
+ *                           std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return
+ *                                dis(gen); });
+ *
+ *        Output:   A 2D tensor holding the sampled category index for each random input.
+ *                  Dimensions are (first dimension of args[0], last dimension of args[1]).
+ *
+*/
 #ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
 #define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP

-#include <migraphx/check_shapes.hpp>
 #include <migraphx/argument.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/reflect.hpp>
 #include <random>
@@ -47,22 +88,35 @@ struct multinomial
    std::string name() const { return "multinomial"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2).only_dims(2);
-        size_t sample_size = inputs.back().lens().back();
+        check_shapes{inputs, *this, true}.has(2).only_dims(2);

-        if(not contains({shape::int32_type, shape::int64_type}, dtype))
-            MIGRAPHX_THROW(
-                "Multinomial: Invalid output type. Valid types are int32_type and int64_type.");
+        if(inputs.back().ndim() < 1)
+            MIGRAPHX_THROW("Multinomial: Second input shape (sample) has no dimensions");
+        if(dtype == shape::bool_type)
+            MIGRAPHX_THROW("Multinomial: boolean output type invalid.");

-        return {dtype, {inputs.front().lens().front(), sample_size}};
+        // Output takes one dimension from each of the two input shapes.  If they are both fixed,
+        // return a static shape
+        if((not inputs.front().dynamic()) or (inputs.front().dyn_dims().front().is_fixed()))
+        {
+            if((not inputs.back().dynamic()) or (inputs.back().dyn_dims().back().is_fixed()))
+            {
+                size_t batch = {inputs.front().max_lens().front()};
+                size_t sample_size{inputs.back().max_lens().back()};
+                return {dtype, {batch, sample_size}};
+            }
+        }
+        return {dtype,
+                {inputs.front().to_dynamic().dyn_dims().front(),
+                 inputs.back().to_dynamic().dyn_dims().back()}};
    }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
-        size_t batch_size  = output_shape.lens().front();
+        argument result{dyn_out.computed_shape};
+        size_t batch_size  = dyn_out.computed_shape.lens().front();
        size_t class_size  = args[0].get_shape().lens().back();
-        size_t sample_size = output_shape.lens().back();
+        size_t sample_size = dyn_out.computed_shape.lens().back();

        visit_all(args[0], args[1])([&](auto cdf, auto dist) {
            result.visit([&](auto output) {
@@ -70,13 +124,16 @@ struct multinomial
                    auto idx       = args[1].get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx[0] * class_size);
                    auto cdf_end   = cdf_begin + class_size;
+
+                    // std::upper_bound returns an iterator to the bucket the sample belongs in,
+                    // after the sample dist[i] is scaled by the CDF total (the last CDF entry)
                    auto sample_iter =
                        std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
+                    // convert iterator to an integer index
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });
-
        return result;
    }
 };

--- a/src/include/migraphx/op/prefix_scan_op.hpp
+++ b/src/include/migraphx/op/prefix_scan_op.hpp
@@ -22,6 +22,12 @@
 * THE SOFTWARE.
 */

+/**
+ * Parent struct for prefix scan ops.  A prefix scan is a mathematical entity useful
+ * in parallelizing various computations.  Given a list of numbers, a prefix scan
+ * op returns an equal size list of running totals of the values.  Other operations
+ * besides addition can be supported by child ops.
+ */
 #ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
 #define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP


--- a/src/include/migraphx/op/random_uniform.hpp
+++ b/src/include/migraphx/op/random_uniform.hpp
@@ -65,11 +65,10 @@ struct random_uniform
        return inputs.at(1);
    }

-    argument compute(const shape&, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
        // Output goes into the passed buffer, not the shape output.
-        auto result = args[1];
-
+        argument result{dyn_out.computed_shape};
        uint64_t local_seed = args[0].at<uint64_t>(0);
        std::mt19937 gen(local_seed);


--- a/src/onnx/parse_clip.cpp
+++ b/src/onnx/parse_clip.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal

--- a/src/onnx/parse_multinomial.cpp
+++ b/src/onnx/parse_multinomial.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -41,6 +41,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
+        if(args.empty())
+            MIGRAPHX_THROW("PARSE_MULTINOMIAL: no arguments given");
+
        int dtype = 6;
        if(contains(info.attributes, "dtype"))
            dtype = info.attributes.at("dtype").i();
@@ -49,35 +52,90 @@ struct parse_multinomial : op_parser<parse_multinomial>
        size_t sample_size = 1;
        if(contains(info.attributes, "sample_size"))
            sample_size = info.attributes.at("sample_size").i();
+        else
+            MIGRAPHX_THROW("PARSE_MULTINOMIAL: sample_size not given");
+
+        // Use logarithmic math to scale probabilities while avoiding division by very
+        // small numbers.  Scaling by the maximum makes very tiny ranges more
+        // tractable; any constant factor gives an equivalent distribution since the
+        // multinomial op normalizes at runtime.

        // Subtract the per-batch maximum log-probability, making the per-batch max 0
        auto maxes =
            info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
-        auto mb_maxes = info.add_instruction(
-            migraphx::make_op("multibroadcast", {{"out_lens", args[0]->get_shape().lens()}}),
-            maxes);
-        auto cdf = info.add_instruction(migraphx::make_op("sub"), args[0], mb_maxes);
+        auto cdf = info.add_common_op("sub", args[0], maxes);
        // Take the element-wise exponent to get probabilities in the range (0, 1]
        cdf = info.add_instruction(migraphx::make_op("exp"), cdf);
-        // Compute the cumulative density function
+        // Compute the cumulative distribution function
        cdf = info.add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

-        // Pre-compute random distribution
-        std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
+        instruction_ref seed_input;
        if(contains(info.attributes, "seed"))
-            gen.seed(info.attributes.at("seed").f());
+        {
+            float seed = info.attributes.at("seed").f();
+            migraphx::shape s{migraphx::shape::float_type, {1}};
+            std::vector<float> data = {seed};
+            seed_input              = info.add_literal(migraphx::literal(s, data));
+        }
+        else
+        {
+            seed_input = info.add_instruction(migraphx::make_op("random_seed"));
+        }
+        instruction_ref randoms;
+
+        shape s0 = args[0]->get_shape();
+
+        if(s0.dynamic())
+        {
+            //  Dynamic batch_size will be taken from args[0].  The input argument to this should
+            // have a second dimension of sample_size.
+            std::vector<shape::dynamic_dimension> dyn_dim_set;
+            dyn_dim_set.emplace_back(s0.dyn_dims().front());
+            dyn_dim_set.emplace_back(shape::dynamic_dimension{sample_size, sample_size});
+
+            // read the input dimensions
+            auto dim_of =
+                info.add_instruction(migraphx::make_op("dimensions_of", {{"end", 2}}), args[0]);
+
+            // The next two operations insert the value sample_size into the second array position
+
+            // make an argument of (1, 0)
+            shape s(shape::int64_type, {2});
+            std::vector<int64_t> data1{1, 0};
+            auto l1        = info.add_literal(s, data1);
+            auto batch_arg = info.add_instruction(migraphx::make_op("mul"), dim_of, l1);
+            std::vector<int64_t> data2(2, 0);
+            // make an argument of (0, sample_size)
+            data2[1]         = sample_size;
+            auto l2          = info.add_literal(s, data2);
+            auto alloc_shape = info.add_instruction(migraphx::make_op("add"), batch_arg, l2);
+            // alloc_shape should contain the input-based shape dimensions as its values at runtime,
+            // and its own shape is {2}
+
+            // compile_shape is the shape used when compiling the Allocate op, and may be dynamic
+            migraphx::shape compile_shape =
+                migraphx::shape(s0.type(), {s0.dyn_dims().front(), {sample_size, sample_size}});

-        std::uniform_real_distribution<> dis(0.0, 1.0);
-        size_t batch_size = args[0]->get_shape().lens().front();
-        migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};
+            // Allocate on-device storage for the random values
+            auto alloc = info.add_instruction(
+                migraphx::make_op("allocate", {{"shape", to_value(compile_shape)}}), alloc_shape);
+            randoms = info.add_instruction(migraphx::make_op("random_uniform"), seed_input, alloc);
+        }
+        else
+        {
+            // use literal.  The array populated by random_uniform may have any shape, as long as
+            // its number of elements is batch_size * sample_size.
+            size_t batch_size = s0.lens().front();
+            auto rand_dummy   = info.add_literal(
+                migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});

-        std::vector<float> random_dist(batch_size * sample_size);
-        std::generate(random_dist.begin(), random_dist.end(), [&]() { return dis(gen); });
-        auto dist_lit = info.add_literal(migraphx::literal{dist_shape, random_dist});
+            randoms =
+                info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
+        }

        return info.add_instruction(
-            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, dist_lit);
+            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, randoms);
    }
 };


--- a/src/onnx/parse_split.cpp
+++ b/src/onnx/parse_split.cpp
@@ -68,13 +68,34 @@ struct parse_split : op_parser<parse_split>
        // no split attribute, input is equally divided
        else
        {
-            if((lens[tuned_axis] % info.num_outputs) != 0)
+            std::size_t num_outputs = info.num_outputs;
+            // the num_outputs attribute seems to be redundant since we already have
+            // node_info::num_outputs, but we can still perform an error check
+            if(contains(info.attributes, "num_outputs"))
            {
-                MIGRAPHX_THROW("PARSE_SPLIT: input cannot be equally divided into " +
-                               std::to_string(info.num_outputs) + " splits!");
+                num_outputs =
+                    parser.parse_value(info.attributes.at("num_outputs")).at<std::size_t>();
+                if(num_outputs != info.num_outputs)
+                {
+                    MIGRAPHX_THROW("PARSE_SPLIT: num_outputs attribute " +
+                                   std::to_string(num_outputs) +
+                                   " doesn't match actual number of outputs " +
+                                   std::to_string(info.num_outputs) + "!");
+                }
+            }
+
+            if(lens[tuned_axis] % num_outputs == 0)
+            {
+                std::size_t chunk_size = lens[tuned_axis] / num_outputs;
+                vec_splits.resize(num_outputs, chunk_size);
+            }
+            else
+            {
+                std::size_t chunk_size      = lens[tuned_axis] / num_outputs + 1;
+                std::size_t last_chunk_size = lens[tuned_axis] - chunk_size * (num_outputs - 1);
+                vec_splits.resize(num_outputs - 1, chunk_size);
+                vec_splits.push_back(last_chunk_size);
            }
-            auto dl = lens[tuned_axis] / info.num_outputs;
-            vec_splits.resize(info.num_outputs, dl);
        }

        if(std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0)) !=

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -253,7 +253,7 @@ get_target_property(ROCBLAS_LOCATION roc::rocblas LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
 check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
 # Beta API for automated GEMM tuning
-check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_BETA_FEATURES_API)
+check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_TUNING_BETA_FEATURE_API)

 set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")

@@ -276,11 +276,11 @@ else()
    message(STATUS "MIOpen does not have find mode api")
 endif()

-if(HAS_ROCBLAS_BETA_FEATURES_API)
+if(HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS)
    message(STATUS "MIGraphx is using Beta API of rocBLAS")
 else()
-    message(STATUS "rocBLAS does not have Beta API")
+    message(STATUS "rocBLAS does not have User Tuning Beta API")
 endif()

 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)

--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
@@ -22,24 +22,14 @@
 * THE SOFTWARE.
 */

-/**
- * Contains a templated struct implementation that wraps several rocBLAS API calls
- * used by the GEMM operator.  These are accessed by methods declared in gemm_impl.hpp
- *
- */
-
 #include <rocblas/rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/generate.hpp>
 #include <migraphx/time.hpp>

 using microseconds = std::chrono::duration<double, std::micro>;

-#if ROCBLAS_VERSION_MAJOR > 2 or (ROCBLAS_VERSION_MAJOR == 2 and ROCBLAS_VERSION_MINOR >= 38)
-using flag_type = rocblas_gemm_flags;
-#else
-using flag_type = int;
-#endif
-
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
@@ -230,7 +220,7 @@ struct gemm_impl
            auto common_args = create_strided_batched_args_common(ctx, input_args);
            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                           common_args,
-                           rocblas_gemm_algo_standard,
+                           rocblas_gemm_algo_solution_index,
                           solution_idx,
                           gemm_flags);
        }
@@ -239,7 +229,7 @@ struct gemm_impl
            auto common_args = create_gemm_ex_args_common(ctx, input_args);
            rocblas_invoke(&rocblas_gemm_ex,
                           common_args,
-                           rocblas_gemm_algo_standard,
+                           rocblas_gemm_algo_solution_index,
                           solution_idx,
                           gemm_flags);
        }
@@ -450,9 +440,6 @@ struct gemm_impl
                ctx.finish();
            });

-            // todo:  Measured time dropped from 20 us to about 6.7 us when I raised hot_calls from
-            // 1 to 11. The higher the hot_calls value, the faster per-call time up to at least 25,
-            // and increasing cold_calls makes little or no difference.  Why?
            host_time /= hot_calls;

            // dev/evaluation only: track time for first solution.
@@ -554,17 +541,16 @@ int32_t gemm_finalize(context& ctx,
    if(solution_idx == 0)
    {
        auto gemm_item = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
-        solution_idx = gemm_item.tune(ctx, input_shapes);
+        solution_idx   = gemm_item.tune(ctx, input_shapes);
    }
    else
    {
        // If a tuned solution index is already given, don't tune again but validate
        // in case the data was tuned with a different rocBLAS version
        auto gemm_item = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
-        solution_idx = gemm_item.validate(ctx, input_shapes, solution_idx);
+        solution_idx   = gemm_item.validate(ctx, input_shapes, solution_idx);
    }
 #else
-    // suppress compiler warnings
    (void)ctx, (void)output_shape, (void)input_shapes;
    (void)alpha, (void)beta, (void)compute_fp32;
 #endif
@@ -584,23 +570,19 @@ int32_t gemm_finalize(context& ctx,
                      int32_t solution_idx)
 {
 #ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
-
-    // This code should be called only if either the environment var.
-    // MIGRAPHX_ENABLE_GEMM_TUNING, or option --exhaustive-tune, is set
    if(solution_idx == 0)
    {
        auto gemm_item = gemm_impl<int32_t>(output_shape, input_shapes, alpha, beta, compute_fp32);
-        solution_idx = gemm_item.tune(ctx, input_shapes);
+        solution_idx   = gemm_item.tune(ctx, input_shapes);
    }
    else
    {
        // If a tuned solution index is already given, don't tune again but validate
        // in case the data was tuned with a different rocBLAS version
        auto gemm_item = gemm_impl<int32_t>(output_shape, input_shapes, alpha, beta, compute_fp32);
-        solution_idx = gemm_item.validate(ctx, input_shapes, solution_idx);
+        solution_idx   = gemm_item.validate(ctx, input_shapes, solution_idx);
    }
 #else
-    // suppress compiler warnings
    (void)ctx, (void)output_shape, (void)input_shapes;
    (void)alpha, (void)beta, (void)compute_fp32;
 #endif

--- a/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
@@ -27,11 +27,7 @@
 #include <iterator>
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
-#include <migraphx/generate.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <migraphx/reduce_dims.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/time.hpp>

 // Set this environment variable to "true" to perform GEMM tuning even when the
 // --exhaustive-tune option isn't set.  Can be used to skip slow convolution tuning.
@@ -44,12 +40,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
-using flag_type = rocblas_gemm_flags;
-#else
-using flag_type = int;
-#endif
-
 /**
 * @brief Templated implementations of the compute() and finalize() methods of the Gemm operator.
 *        For each function there are overloads using either float or int32_t for the arguments

--- a/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/gpu/config.hpp>
-// ROCBLAS_BETA_FEATURES_API is defined by CMake, if available.
 #include <rocblas/rocblas.h>

 namespace migraphx {

--- a/test/api/CMakeLists.txt
+++ b/test/api/CMakeLists.txt
@@ -30,6 +30,9 @@ function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
    add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR}) 
    add_dependencies(tests ${NAME})
    add_dependencies(check ${NAME})
+    if(WIN32)
+        target_compile_definitions(${NAME} PRIVATE _CRT_SECURE_NO_WARNINGS)
+    endif()
 endfunction()

 # Workaround: C file dont work with clang-tidy right now, need a fix in rocm-cmake
@@ -41,6 +44,9 @@ function(add_c_api_test TEST_NAME TEST_SRC TEST_DIR)
    add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR}) 
    add_dependencies(tests ${NAME})
    add_dependencies(check ${NAME})
+    if(WIN32)
+        target_compile_definitions(${NAME} PRIVATE _CRT_SECURE_NO_WARNINGS)
+    endif()
 endfunction()

 add_api_test(array_base test_array_base.cpp ${TEST_ONNX_DIR})
@@ -57,10 +63,6 @@ add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
 add_api_test(tf_parser test_tf_parser.cpp ${TEST_TF_DIR})
 # GPU-based tests
 if(MIGRAPHX_ENABLE_GPU)
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-find_package(hip)
 add_api_test(gpu test_gpu.cpp ${TEST_ONNX_DIR})
-target_link_libraries(test_api_gpu)
 add_api_test(custom_op_gpu test_custom_op_gpu.cpp ${TEST_ONNX_DIR})
-target_link_libraries(test_api_custom_op_gpu)
 endif()
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -4883,9 +4883,9 @@ def mod_test_fmod_different_dtypes():

 @onnx_test()
 def multinomial_test():
-    sample_size = 10
-    seed = 0.0
-    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
+    sample_size = 13
+    seed = 0.
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32,
                                           [1, 10])

@@ -4898,6 +4898,44 @@ def multinomial_test():
    return ([node], [input], [output])


+@onnx_test()
+def multinomial_dyn_test():
+    sample_size = 100000
+    seed = 1.3
+    categories = 5
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
+                                          [None, categories])
+    output = helper.make_tensor_value_info("output", TensorProto.FLOAT,
+                                           [None, categories])
+
+    node = onnx.helper.make_node(
+        'Multinomial',
+        inputs=['input'],
+        sample_size=sample_size,
+        dtype=1,  # shape::float_type
+        seed=seed,
+        outputs=['output'])
+
+    return ([node], [input], [output])
+
+
+@onnx_test()
+def multinomial_autoseed_dyn_test():
+    # If seed attribute is not given, device should auto generate one at runtime
+    sample_size = 12
+    input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
+                                          [None, 10])
+    output = helper.make_tensor_value_info("output", TensorProto.INT32,
+                                           [None, 10])
+
+    node = onnx.helper.make_node('Multinomial',
+                                 inputs=['input'],
+                                 sample_size=sample_size,
+                                 outputs=['output'])
+
+    return ([node], [input], [output])
+
+
 @onnx_test()
 def multinomial_generated_seed_test():
    sample_size = 10
@@ -8042,6 +8080,42 @@ def split_test_no_attribute():
    return ([const_node, node], [x], [y1, y2, y3, y4])


+@onnx_test()
+def split_test_uneven():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [12, 15])
+    y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
+    y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
+    y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
+    y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [3, 15])
+    y5 = helper.make_tensor_value_info('y5', TensorProto.FLOAT, [0, 15])
+
+    node = onnx.helper.make_node(
+        'Split',
+        inputs=['x'],
+        outputs=['y1', 'y2', 'y3', 'y4', 'y5'],
+    )
+
+    return ([node], [x], [y1, y2, y3, y4, y5])
+
+
+@onnx_test()
+def split_test_uneven_num_outputs():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
+    y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
+    y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
+    y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
+    y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [2, 15])
+
+    node = onnx.helper.make_node(
+        'Split',
+        inputs=['x'],
+        outputs=['y1', 'y2', 'y3', 'y4'],
+        num_outputs=4,
+    )
+
+    return ([node], [x], [y1, y2, y3, y4])
+
+
 @onnx_test()
 def split_test_no_attribute_invalid_split():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [300, 15])
@@ -8101,6 +8175,24 @@ def split_test_no_attribute_invalid_input_split():
    return ([node], [x], [y1, y2, y3])


+@onnx_test()
+def split_test_invalid_num_outputs():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
+    y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
+    y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
+    y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
+    y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [2, 15])
+
+    node = onnx.helper.make_node(
+        'Split',
+        inputs=['x'],
+        outputs=['y1', 'y2', 'y3', 'y4'],
+        num_outputs=5,
+    )
+
+    return ([node], [x], [y1, y2, y3, y4])
+
+
 @onnx_test()
 def sqrt_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])

--- a/test/onnx/multinomial_autoseed_dyn_test.onnx
+++ b/test/onnx/multinomial_autoseed_dyn_test.onnx
--- a/test/onnx/multinomial_dyn_test.onnx
+++ b/test/onnx/multinomial_dyn_test.onnx
--- a/test/onnx/multinomial_int64_test.onnx
+++ b/test/onnx/multinomial_int64_test.onnx
--- a/test/onnx/multinomial_test.onnx
+++ b/test/onnx/multinomial_test.onnx
--- a/test/onnx/onnx_test.cpp
+++ b/test/onnx/onnx_test.cpp
@@ -4679,32 +4679,140 @@ TEST_CASE(multinomial_test)
 {
    migraphx::program p;
    auto* mm           = p.get_main_module();
-    size_t sample_size = 10;
-    float seed         = 0.0f;
+    size_t sample_size = 13; // batch_size * sample_size random values are requested below
+    size_t batch_size  = 3;
+    size_t categories  = 10; // second dimension of "input" and of the broadcast maxima
+    float seed         = 0;

-    auto input = mm->add_parameter("input", migraphx::shape{migraphx::shape::float_type, {1, 10}});
-    auto maxes = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), input);
-    auto mb_maxes =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 10}}}), maxes);
+    auto input = mm->add_parameter(
+        "input", migraphx::shape{migraphx::shape::float_type, {batch_size, categories}});
+    auto maxes    = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), input);
+    auto mb_maxes = mm->add_instruction( // broadcast the axis-1 maxima to the input's lens
+        migraphx::make_op("multibroadcast", {{"out_lens", {batch_size, categories}}}), maxes);
    auto cdf = mm->add_instruction(migraphx::make_op("sub"), input, mb_maxes);
    cdf      = mm->add_instruction(migraphx::make_op("exp"), cdf);
    cdf      = mm->add_instruction(
        migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

-    std::mt19937 gen(seed);
-    std::uniform_real_distribution<> dis(0.0, 1.0);
-    std::vector<float> rand_samples(sample_size);
-    std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return dis(gen); });
-    migraphx::shape rs{migraphx::shape::float_type, {1, sample_size}};
-    auto rs_lit = mm->add_literal(migraphx::literal{rs, rand_samples});
-
-    mm->add_instruction(migraphx::make_op("multinomial"), cdf, rs_lit);
+    migraphx::shape s{migraphx::shape::float_type, {1}};
+    std::vector<float> seed_data = {seed};
+    auto seed_input              = mm->add_literal(migraphx::literal(s, seed_data));
+    auto rand_dummy = // NOTE(review): presumably only conveys the shape of the randoms to generate; confirm
+        mm->add_literal(migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});

+    auto randoms = mm->add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
+    mm->add_instruction(migraphx::make_op("multinomial"), cdf, randoms); // samples now come from random_uniform
    auto prog = optimize_onnx("multinomial_test.onnx");

    EXPECT(p == prog);
 }

+TEST_CASE(multinomial_dyn_test)
+{
+    // Dynamic-shape input; the random seed is a compile-time literal in the expected program.
+    migraphx::program p;
+    auto* mm           = p.get_main_module();
+    size_t sample_size = 100000;
+    size_t categories  = 5;
+    float seed         = 1.3f;
+
+    auto input = mm->add_parameter(
+        "input",
+        migraphx::shape{migraphx::shape::float_type, {{1, categories}, {categories, categories}}});
+
+    auto maxes = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), input);
+
+    auto cdf = add_common_op(*mm, migraphx::make_op("sub"), {input, maxes}); // input minus its axis-1 maximum
+    cdf      = mm->add_instruction(migraphx::make_op("exp"), cdf);
+    cdf      = mm->add_instruction(
+        migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);
+
+    migraphx::shape s{migraphx::shape::float_type, {1}};
+    std::vector<float> seed_data = {seed};
+    auto seed_input              = mm->add_literal(migraphx::literal(s, seed_data));
+
+    // Dynamic input only: alloc_shape must be computed at runtime as (batch_size, sample_size).
+    // Start by reading the runtime input dimensions.
+    auto dim_of = mm->add_instruction(migraphx::make_op("dimensions_of", {{"end", 2}}), input);
+    // Multiply by the literal (1, 0) so that only the first dimension survives.
+    migraphx::shape lit_shape(migraphx::shape::int64_type, {2});
+    std::vector<int64_t> data1{1, 0};
+    auto l1        = mm->add_literal(lit_shape, data1);
+    auto batch_arg = mm->add_instruction(migraphx::make_op("mul"), dim_of, l1);
+    std::vector<int64_t> data2(2, 0);
+    // Then add the literal (0, sample_size) to fill in the second dimension.
+    data2[1]         = sample_size;
+    auto l2          = mm->add_literal(lit_shape, data2);
+    auto alloc_shape = mm->add_instruction(migraphx::make_op("add"), batch_arg, l2);
+    migraphx::shape compile_shape =
+        migraphx::shape(migraphx::shape::float_type,
+                        {input->get_shape().dyn_dims().front(), {sample_size, sample_size}});
+
+    auto alloc = mm->add_instruction( // allocate gets the dynamic compile_shape plus the runtime dims
+        migraphx::make_op("allocate", {{"shape", to_value(compile_shape)}}), alloc_shape);
+
+    auto randoms = mm->add_instruction(migraphx::make_op("random_uniform"), seed_input, alloc);
+    auto ret     = mm->add_instruction(
+        migraphx::make_op("multinomial", {{"dtype", migraphx::shape::float_type}}), cdf, randoms);
+    mm->add_return({ret});
+
+    migraphx::onnx_options options;
+    options.default_dyn_dim_value  = {1, categories}; // same {1, categories} range as the first dim of "input"
+    options.print_program_on_error = true;
+    auto prog                      = migraphx::parse_onnx("multinomial_dyn_test.onnx", options);
+    EXPECT(p == prog);
+}
+
+TEST_CASE(multinomial_autoseed_dyn_test)
+{
+    // Dynamic-shape input; the seed comes from a random_seed instruction instead of a literal.
+    migraphx::program p;
+    auto* mm           = p.get_main_module();
+    size_t sample_size = 12;
+    size_t categories  = 10;
+
+    auto input = mm->add_parameter( // NOTE(review): the literal 10s below duplicate `categories`
+        "input", migraphx::shape{migraphx::shape::float_type, {{1, 10}, {10, 10}}});
+
+    auto maxes = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), input);
+
+    auto cdf = add_common_op(*mm, migraphx::make_op("sub"), {input, maxes}); // input minus its axis-1 maximum
+    cdf      = mm->add_instruction(migraphx::make_op("exp"), cdf);
+    cdf      = mm->add_instruction(
+        migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);
+    auto seed_input = mm->add_instruction(migraphx::make_op("random_seed")); // no seed literal in this variant
+
+    // Dynamic input only: alloc_shape must be computed at runtime as (batch_size, sample_size).
+    // Start by reading the runtime input dimensions.
+    auto dim_of = mm->add_instruction(migraphx::make_op("dimensions_of", {{"end", 2}}), input);
+    // Multiply by the literal (1, 0) so that only the first dimension survives.
+    migraphx::shape lit_shape(migraphx::shape::int64_type, {2});
+    std::vector<int64_t> data1{1, 0};
+    auto l1        = mm->add_literal(lit_shape, data1);
+    auto batch_arg = mm->add_instruction(migraphx::make_op("mul"), dim_of, l1);
+    std::vector<int64_t> data2(2, 0);
+    // Then add the literal (0, sample_size) to fill in the second dimension.
+    data2[1]         = sample_size;
+    auto l2          = mm->add_literal(lit_shape, data2);
+    auto alloc_shape = mm->add_instruction(migraphx::make_op("add"), batch_arg, l2);
+    migraphx::shape compile_shape =
+        migraphx::shape(migraphx::shape::float_type,
+                        {input->get_shape().dyn_dims().front(), {sample_size, sample_size}});
+
+    auto alloc = mm->add_instruction( // allocate gets the dynamic compile_shape plus the runtime dims
+        migraphx::make_op("allocate", {{"shape", to_value(compile_shape)}}), alloc_shape);
+
+    auto randoms = mm->add_instruction(migraphx::make_op("random_uniform"), seed_input, alloc);
+    auto ret     = mm->add_instruction(migraphx::make_op("multinomial"), cdf, randoms);
+    mm->add_return({ret});
+
+    migraphx::onnx_options options;
+    options.default_dyn_dim_value  = {1, categories}; // same {1, 10} range as the first dim of "input"
+    options.print_program_on_error = true;
+    auto prog = migraphx::parse_onnx("multinomial_autoseed_dyn_test.onnx", options);
+    EXPECT(p == prog);
+}
+
 TEST_CASE(multinomial_dtype_error_test)
 {
    EXPECT(test::throws([&] { migraphx::parse_onnx("multinomial_dtype_error_test.onnx"); }));
@@ -4712,10 +4820,11 @@ TEST_CASE(multinomial_dtype_error_test)

 TEST_CASE(multinomial_generated_seed_test)
 {
+    // The multinomial op no longer generates its own randoms, so two parses must compare equal.
    auto p1 = optimize_onnx("multinomial_generated_seed_test.onnx");
    auto p2 = optimize_onnx("multinomial_generated_seed_test.onnx");

-    EXPECT(p1 != p2);
+    EXPECT(p1 == p2);
 }

 TEST_CASE(multinomial_int64_test)
@@ -4723,27 +4832,27 @@ TEST_CASE(multinomial_int64_test)
    migraphx::program p;
    auto* mm                      = p.get_main_module();
    size_t sample_size            = 10;
-    float seed                    = 1.0f;
+    float seed                    = 1.0;
+    uint32_t batch_size           = 1;
    migraphx::shape::type_t dtype = migraphx::shape::type_t::int64_type;

    auto input = mm->add_parameter("input", migraphx::shape{migraphx::shape::float_type, {1, 10}});
    auto maxes = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), input);
-    auto mb_maxes =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 10}}}), maxes);
-    auto cdf = mm->add_instruction(migraphx::make_op("sub"), input, mb_maxes);
+
+    auto cdf = add_common_op(*mm, migraphx::make_op("sub"), {input, maxes});
    cdf      = mm->add_instruction(migraphx::make_op("exp"), cdf);
    cdf      = mm->add_instruction(
        migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

-    std::mt19937 gen(seed);
-    std::uniform_real_distribution<> dis(0.0, 1.0);
-    std::vector<float> rand_samples(sample_size);
-    std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return dis(gen); });
-    migraphx::shape rs{migraphx::shape::float_type, {1, sample_size}};
-    auto rs_lit = mm->add_literal(migraphx::literal{rs, rand_samples});
-
-    mm->add_instruction(migraphx::make_op("multinomial", {{"dtype", dtype}}), cdf, rs_lit);
+    migraphx::shape s{migraphx::shape::float_type, {1}};
+    std::vector<float> data = {seed};
+    auto seed_input         = mm->add_literal(migraphx::literal(s, data));

+    // static size
+    auto rand_dummy =
+        mm->add_literal(migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});
+    auto randoms = mm->add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
+    mm->add_instruction(migraphx::make_op("multinomial", {{"dtype", dtype}}), cdf, randoms);
    auto prog = optimize_onnx("multinomial_int64_test.onnx");

    EXPECT(p == prog);
@@ -7671,6 +7780,46 @@ TEST_CASE(split_test_default)
    EXPECT(p == prog);
 }

+TEST_CASE(split_test_uneven) // 12 rows parsed into five axis-0 slices; the fifth one is empty
+{
+    migraphx::program p;
+    auto* mm   = p.get_main_module();
+    auto input = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {12, 15}});
+    auto r1    = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {3}}}), input);
+    auto r2 = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {3}}, {"ends", {6}}}), input);
+    auto r3 = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {6}}, {"ends", {9}}}), input);
+    auto r4 = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {9}}, {"ends", {12}}}), input);
+    auto r5 = mm->add_instruction( // zero-length slice: starts == ends == 12
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {12}}, {"ends", {12}}}), input);
+    mm->add_return({r1, r2, r3, r4, r5});
+
+    auto prog = migraphx::parse_onnx("split_test_uneven.onnx");
+    EXPECT(p == prog);
+}
+
+TEST_CASE(split_test_uneven_num_outputs) // 11 rows, num_outputs = 4: three 3-row slices, then a 2-row slice
+{
+    migraphx::program p;
+    auto* mm   = p.get_main_module();
+    auto input = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {11, 15}});
+    auto r1    = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {3}}}), input);
+    auto r2 = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {3}}, {"ends", {6}}}), input);
+    auto r3 = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {6}}, {"ends", {9}}}), input);
+    auto r4 = mm->add_instruction( // remainder: rows [9, 11)
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {9}}, {"ends", {11}}}), input);
+    mm->add_return({r1, r2, r3, r4});
+
+    auto prog = migraphx::parse_onnx("split_test_uneven_num_outputs.onnx");
+    EXPECT(p == prog);
+}
+
 TEST_CASE(split_test_no_attribute_invalid_split)
 {
    EXPECT(
@@ -7688,6 +7837,11 @@ TEST_CASE(split_test_no_attribute_invalid_input_split)
        [&] { migraphx::parse_onnx("split_test_no_attribute_invalid_input_split.onnx"); }));
 }

+TEST_CASE(split_test_invalid_num_outputs) // model sets num_outputs = 5 but lists only 4 outputs
+{
+    EXPECT(test::throws([&] { migraphx::parse_onnx("split_test_invalid_num_outputs.onnx"); })); // must reject
+}
+
 TEST_CASE(sqrt_test)
 {
    migraphx::program p;

--- a/test/onnx/split_test_invalid_num_outputs.onnx
+++ b/test/onnx/split_test_invalid_num_outputs.onnx
+	split_test_invalid_num_outputs:
+.
+xy1y2y3y4"Split*
+num_outputssplit_test_invalid_num_outputsZ
+x
+
+
+b
+y1
+
+
+b
+y2
+
+
+b
+y3
+
+
+b
+y4
+
+
+B
\ No newline at end of file
--- a/test/onnx/split_test_uneven.onnx
+++ b/test/onnx/split_test_uneven.onnx