Merge branch 'develop' into mi200

70d9faf7 · Chris Austen · GitHub · a56c531c · a60bdb67 · 70d9faf7
Unverified Commit 70d9faf7 authored Dec 13, 2023 by Chris Austen Committed by GitHub Dec 13, 2023
20 changed files
--- a/src/include/migraphx/op/allocate.hpp
+++ b/src/include/migraphx/op/allocate.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -37,20 +37,22 @@ namespace op {
 * Static allocate:
 * No inputs: `allocate()`
 * `this.s` attribute set to the static output shape of the buffer.
+ * `this.s` attribute can be set to a dynamic output shape; however this will allocate the maximum
+ * buffer size for that case
 *
 * Dynamic allocate:
 * One input: `allocate(output_dims)`
 * `output_dims` are the output buffer dimensions and has a static shape.
- * Either `this.s` or `this.buf_type` must be set to calculate the dynamic output shape at compute
- * time. If `this.buf_type` is set, the compute_shape() of allocate at compile time will have
- * dynamic_dimensions from {0, max_int} with rank = output_dims.ndim(). If `this.s` is set then the
- * compute_shape() will output `this.s`; `this.s` should be a dynamic shape.
+ * Either `this.s` or `this.buf_type` (but not both) must be set to calculate the dynamic output
+ * shape at compute time. If `this.buf_type` is set, the compute_shape() of allocate at compile time
+ * will have dynamic_dimensions from {0, max_int} with rank = output_dims.ndim(). If `this.s` is set
+ * then the compute_shape() will output `this.s`; `this.s` should be a dynamic shape.
 */
 struct allocate
 {
-    shape s{};
+    optional<shape> s;
    // for dynamic allocate to set the buffer type
-    shape::type_t buf_type = shape::half_type;
+    optional<shape::type_t> buf_type;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -62,8 +64,12 @@ struct allocate

    shape compute_shape(const std::vector<shape>& inputs) const
    {
-        if(s != shape())
+        if(s.has_value())
        {
+            if(buf_type.has_value())
+            {
+                MIGRAPHX_THROW("ALLOCATE: shape and buf_type attributes both set");
+            }
            if(inputs.size() == 1)
            {
                migraphx::check_shapes{inputs, *this, false}.only_dims(1);
@@ -72,16 +78,20 @@ struct allocate
            {
                migraphx::check_shapes{inputs, *this, false}.has(0);
            }
-            return s;
+            return s.value();
        }
        else
        {
+            if(not buf_type.has_value())
+            {
+                MIGRAPHX_THROW("ALLOCATE: shape and buf_type attributes both not set");
+            }
            migraphx::check_shapes{inputs, *this, false}.has(1).only_dims(1);
            const auto& out_dims = inputs.at(0);
            std::size_t max_val = std::numeric_limits<std::size_t>::max();
            std::vector<shape::dynamic_dimension> dyn_dims(out_dims.lens().at(0),
                                                           shape::dynamic_dimension{0, max_val});
-            return {buf_type, dyn_dims};
+            return {buf_type.value(), dyn_dims};
        }
    }
    argument compute(const shape& output_shape, const std::vector<argument>& args) const
@@ -94,7 +104,11 @@ struct allocate
        {
            std::vector<std::size_t> output_dims(output_shape.ndim());
            args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); });
-            return argument{shape{buf_type, output_dims}};
+            if(s)
+            {
+                return argument{shape{s->type(), output_dims}};
+            }
+            return argument{shape{buf_type.value(), output_dims}};
        }
    }
 };

--- a/src/include/migraphx/op/binary.hpp
+++ b/src/include/migraphx/op/binary.hpp
@@ -29,6 +29,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/dyn_output.hpp>
+#include <migraphx/par.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -95,7 +96,7 @@ struct binary : op_name<Derived>
    {
        argument result{dyn_out.computed_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            std::transform(input1.begin(),
+            par_transform(input1.begin(),
                          input1.end(),
                          input2.begin(),
                          output.begin(),

--- a/src/include/migraphx/op/dequantizelinear.hpp
+++ b/src/include/migraphx/op/dequantizelinear.hpp
@@ -72,8 +72,8 @@ struct dequantizelinear
        visit_all(x, x_zero_point)([&](auto input, auto zero_pts) {
            visit_all(result, x_scale)([&](auto output, auto scales) {
                par_for(output_shape.elements(), [&](auto i) {
-                    output[i] = static_cast<double>(static_cast<int64_t>(input[i]) -
-                                                    static_cast<int64_t>(zero_pts[i])) *
+                    output[i] = static_cast<double>(static_cast<double>(input[i]) -
+                                                    static_cast<double>(zero_pts[i])) *
                                scales[i];
                });
            });

--- a/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,31 +21,32 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
-#define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
+#ifndef MIGRAPHX_GUARD_OPERATORS_ISINF_HPP
+#define MIGRAPHX_GUARD_OPERATORS_ISINF_HPP

-#include <migraphx/argument.hpp>
+#include <migraphx/op/unary.hpp>
 #include <migraphx/config.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace op {

-struct context;
-
-struct miopen_int8_conv_pack
+struct isinf : unary<isinf>
 {
-    std::string name() const { return "gpu::int8_conv_pack"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    auto apply() const
+    {
+        return [&](auto x) { return std::isinf(static_cast<double>(x)); };
+    }
+
+    std::string name() const { return "isinf"; }
+
+    shape compute_shape(std::vector<shape> inputs) const
    {
-        return shapes.size() - 1;
+        return unary<isinf>::compute_shape(std::move(inputs)).with_type(shape::bool_type);
    }
 };

-} // namespace gpu
+} // namespace op
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/include/migraphx/op/multinomial.hpp
+++ b/src/include/migraphx/op/multinomial.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,11 +21,52 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
+/**
+ * Multinomial or categorical distribution.  Maps each random input sample to the
+ *         index of the category, or bucket, that it falls in.  This does not require
+ *         the standard multinomial distribution but instead takes a probability
+ *         distribution, expressed as a cumulative distribution function (CDF), as its
+ *         first input.
+ *
+ *      Inputs:   args[0] - a tensor of probabilities for each category.  Values are
+ *                          cumulative distribution function (CDF) totals as provided by
+ *                          operation prefix_scan_sum, i.e. cumulative probabilities
+ *                          (start with any set of numbers > 0 and then apply
+ *                          prefix_scan_sum).  Values do not need to be normalized to
+ *                          sum to 1; this is done in runtime computation.
+ *
+ *                          This input has Rank 2.  Dimension 0 is batch #, so that there can be
+ *                          a different CDF for each iteration in the batch.  The size of dimension
+ *                          1 is the number of categories.
+ *
+ *                args[1] - a tensor of random numbers.  The last dimension is the sample
+ *                          size, i.e. the number of
+ *                          random samples in each iteration of the batch.  Nominally
+ *                          has two dimensions where the first dimension is batch size, but
+ *                          any reshaping such that the total
+ *                          number of elements is (batch_size * sample_size) is legal.
+ *
+ *                          Values as created by a std::mt19937 like this:
+ *
+ *                           size_t sample_size = 100000;
+ *                           float seed         = 0.0f;
+ *                           std::mt19937 gen(seed);
+ *                           std::uniform_real_distribution<> dis(0.0, 1.0);
+ *                           std::vector<float> rand_samples(sample_size);
+ *                           std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return
+ *                                dis(gen); });
+ *
+ *        Output:   A 2D tensor holding the category index chosen for each random sample.
+ *                  Dimensions are (first dimension of args[0], last dimension of args[1]).
+ *
+*/
 #ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
 #define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP

-#include <migraphx/check_shapes.hpp>
 #include <migraphx/argument.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/reflect.hpp>
 #include <random>
@@ -47,22 +88,35 @@ struct multinomial
    std::string name() const { return "multinomial"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2).only_dims(2);
-        size_t sample_size = inputs.back().lens().back();
+        check_shapes{inputs, *this, true}.has(2).only_dims(2);

-        if(not contains({shape::int32_type, shape::int64_type}, dtype))
-            MIGRAPHX_THROW(
-                "Multinomial: Invalid output type. Valid types are int32_type and int64_type.");
+        if(inputs.back().ndim() < 1)
+            MIGRAPHX_THROW("Multinomial: Second input shape (sample) has no dimensions");
+        if(dtype == shape::bool_type)
+            MIGRAPHX_THROW("Multinomial: boolean output type invalid.");

-        return {dtype, {inputs.front().lens().front(), sample_size}};
+        // Output takes one dimension from each of the two input shapes.  If they are both fixed,
+        // return a static shape
+        if((not inputs.front().dynamic()) or (inputs.front().dyn_dims().front().is_fixed()))
+        {
+            if((not inputs.back().dynamic()) or (inputs.back().dyn_dims().back().is_fixed()))
+            {
+                size_t batch = {inputs.front().max_lens().front()};
+                size_t sample_size{inputs.back().max_lens().back()};
+                return {dtype, {batch, sample_size}};
+            }
+        }
+        return {dtype,
+                {inputs.front().to_dynamic().dyn_dims().front(),
+                 inputs.back().to_dynamic().dyn_dims().back()}};
    }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
-        size_t batch_size  = output_shape.lens().front();
+        argument result{dyn_out.computed_shape};
+        size_t batch_size  = dyn_out.computed_shape.lens().front();
        size_t class_size  = args[0].get_shape().lens().back();
-        size_t sample_size = output_shape.lens().back();
+        size_t sample_size = dyn_out.computed_shape.lens().back();

        visit_all(args[0], args[1])([&](auto cdf, auto dist) {
            result.visit([&](auto output) {
@@ -70,13 +124,16 @@ struct multinomial
                    auto idx       = args[1].get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx[0] * class_size);
                    auto cdf_end   = cdf_begin + class_size;
+
+                    // std::upper_bound returns an iterator to the bucket the value belongs in,
+                    // when normalized by the probability distribution dist
                    auto sample_iter =
                        std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
+                    // convert iterator to an integer index
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });
-
        return result;
    }
 };

--- a/src/targets/ref/include/migraphx/ref/gemm.hpp
+++ b/src/targets/ref/include/migraphx/ref/gemm.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,25 +21,29 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_GEMM_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CPU_GEMM_HPP
+#ifndef MIGRAPHX_GUARD_OPERATORS_NEARBYINT_HPP
+#define MIGRAPHX_GUARD_OPERATORS_NEARBYINT_HPP

-#include <migraphx/argument.hpp>
+#include <migraphx/op/unary.hpp>
 #include <migraphx/config.hpp>
+#include <fenv.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace ref {
-
-void migemm(
-    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
-void migemm(const argument& c_arg,
-            const argument& a_arg,
-            const argument& b_arg,
-            int32_t alpha,
-            int32_t beta);
-
-} // namespace ref
+namespace op {
+/**
+ * Unary operator that applies std::nearbyint to each element while the floating-point
+ * rounding mode is temporarily set to FE_TONEAREST.
+ */
+struct nearbyint : unary<nearbyint>
+{
+    /// Returns the element-wise functor used by the unary base.
+    auto apply() const
+    {
+        return [](auto x) {
+            // Remember the caller's rounding mode so it can be put back afterwards.
+            const auto rounding_mode = fegetround();
+            fesetround(FE_TONEAREST);
+            const auto rounded = std::nearbyint(x);
+            // Restore before returning: a statement placed after `return` never runs,
+            // which would leave the thread's rounding mode changed for all later code.
+            fesetround(rounding_mode);
+            return rounded;
+        };
+    }
+};
+} // namespace op
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/include/migraphx/op/normalize_attribute.hpp
+++ b/src/include/migraphx/op/normalize_attribute.hpp
@@ -40,6 +40,8 @@ namespace op {
 * 2. use_rank (default) vs use_lens:
 *  `use_rank` sets the max value/index of the attribute as the rank of lens.
 *  `use_lens` sets the max value/index as the corresponding value in lens at the axes index.
+ *      Uses the dynamic_dimension.max value for dynamic shapes. Returns the original vector
+ *      (no normalization) if any of dynamic_dimension[axes] are not fixed.
 * 3. `clip_min` vs. `not_clip_min` (default):
 *  Clip values less than the minimum to the minimum or not.
 * 4. `include_min` vs. `exclude_min` (default):

--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
@@ -70,7 +70,8 @@ struct pooling
    // 2 smaller than the input tensor rank (NCHW layout)
    std::vector<std::size_t> lengths = {1, 1};

-    // Dilations are not supported at this time.
+    // Spacing between the elements of the pooling kernel. Must be the same ndim as lengths.
+    std::vector<std::size_t> dilations = {1, 1};

    // ceiling mode is a flag affecting output size
    // or equivalently, placements of the pooling kernel.
@@ -99,6 +100,7 @@ struct pooling
                    f(self.padding_mode, "padding_mode"),
                    f(self.stride, "stride"),
                    f(self.lengths, "lengths"),
+                    f(self.dilations, "dilations"),
                    f(self.ceil_mode, "ceil_mode"),
                    f(self.lp_order, "lp_order"),
                    f(self.dyn_global, "dyn_global"));
@@ -112,14 +114,17 @@ struct pooling
            return;
        if((padding_mode != default_ and padding.size() != stride.size() and
            (padding.size()) != stride.size() * 2) or
-           stride.size() != lengths.size())
+           stride.size() != lengths.size() or dilations.size() != lengths.size())
        {
            MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
        }
-        if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
-           std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); }))
+
+        const auto is_zero = [](auto el) { return el == 0; };
+        if(std::any_of(lengths.begin(), lengths.end(), is_zero) or
+           std::any_of(stride.begin(), stride.end(), is_zero) or
+           std::any_of(dilations.begin(), dilations.end(), is_zero))
        {
-            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride");
+            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride or dilations");
        }

        // TODO:  update lowering to run the reference
@@ -142,6 +147,11 @@ struct pooling

    value attributes() const { return {{"normalize_padding", "padding"}}; }

+    inline std::size_t dilate_dim(std::size_t dim, std::size_t dilation) const
+    {
+        return 1 + dilation * (dim - 1);
+    }
+
    std::vector<std::size_t> calc_spatial_dim_out(const std::vector<std::size_t>& input_lens,
                                                  std::size_t kdims) const
    {
@@ -151,8 +161,9 @@ struct pooling
            std::size_t padding_factor = 2 * padding[i];
            if(padding.size() == 2 * kdims)
                padding_factor = padding[i] + padding[i + kdims];
+            std::size_t dilated_length = dilate_dim(lengths[i], dilations[i]);
            std::size_t dim_size;
-            if(input_lens[i + 2] + padding_factor < lengths[i])
+            if(input_lens[i + 2] + padding_factor < dilated_length)
            {
                if(padding_mode == default_)
                    MIGRAPHX_THROW("POOLING: not enough padding for the given kernel size");
@@ -162,7 +173,7 @@ struct pooling
            }
            else
            {
-                dim_size = input_lens[i + 2] + padding_factor - lengths[i];
+                dim_size = input_lens[i + 2] + padding_factor - dilated_length;
            }
            std::size_t len =
                (ceil_mode)
@@ -331,6 +342,7 @@ struct pooling
                int start = static_cast<int>(idx_o[dim] * stride[d_2]) -
                            static_cast<int>(padding_vals[d_2]);
                int end;
+                std::size_t dilated_kernel_dim = dilate_dim(kernel_dims[d_2], dilations[d_2]);
                // NOLINT
                if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
                {
@@ -340,15 +352,14 @@ struct pooling
                    // padding.  Clip out-of-bounds indexes but not padding.

                    // Check if this kernel extends beyond the padding at end of dimension
-                    end = std::min(start + kernel_dims[d_2],
+                    end = std::min(start + dilated_kernel_dim,
                                   in_lens[dim] + static_cast<int>(padding_vals[d_2]));
                }
                else
                {
                    // In non-ceiling mode, when
                    // count_include_pad is false, or for max pooling, clip off padding.
-                    end   = std::min(start + kernel_dims[d_2], in_lens[dim]);
-                    start = std::max(start, 0);
+                    end = std::min(start + dilated_kernel_dim, in_lens[dim]);
                }
                win_start.push_back(start);
                if(end < start)
@@ -366,6 +377,16 @@ struct pooling

            // for each element in the window...
            shape_for_each(win_shape, [&](const auto& idx_w) {
+                // Skip elements that belong to the dilated area
+                for(size_t axis = 0; axis < idx_w.size(); ++axis)
+                {
+                    if(idx_w[axis] % dilations[axis])
+                    {
+                        pool_size -= 1;
+                        return;
+                    }
+                }
+
                // the coordinates of this element
                auto idx = idx_o;

@@ -390,7 +411,15 @@ struct pooling
                    // this is a padding element.  Padding locations
                    // don't contribute to average or max pooling total but can play in
                    // lpnorm pooling.
-                    output_val = op(output_val, 0);
+                    if(mode == pooling_mode::lpnorm)
+                    {
+                        output_val = op(output_val, op.template init<Type>());
+                    }
+                    if(mode == pooling_mode::average)
+                    {
+                        // Ignore padding
+                        pool_size -= 1;
+                    }
                }
            });
            output[i] = Type(op.final(output_val, pool_size));

--- a/src/include/migraphx/op/prefix_scan_op.hpp
+++ b/src/include/migraphx/op/prefix_scan_op.hpp
@@ -22,6 +22,12 @@
 * THE SOFTWARE.
 */

+/**
+ * Parent struct for prefix scan ops.  A prefix scan is a mathematical entity useful
+ * in parallelizing various computations.  Given a list of numbers, a prefix scan
+ * op returns an equal size list of running totals of the values.  Other operations
+ * besides addition can be supported by child ops.
+ */
 #ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
 #define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP


--- a/src/include/migraphx/op/quant_convolution.hpp
+++ b/src/include/migraphx/op/quant_convolution.hpp
@@ -27,6 +27,7 @@
 #include <migraphx/op/common.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/shape.hpp>
 #include <migraphx/config.hpp>
 #include <migraphx/convolution.hpp>
 #include <migraphx/value.hpp>
@@ -87,11 +88,13 @@ struct quant_convolution
        }

        // all input type must be int8_type and output is float_type
-        if(t != shape::int8_type)
+        std::set<migraphx::shape::type_t> supported_types = {shape::int8_type,
+                                                             shape::fp8e4m3fnuz_type};
+        if(not contains(supported_types, t))
        {
-            MIGRAPHX_THROW("QUANT_CONVOLUTION: only accept input and weights of type int8_t");
+            MIGRAPHX_THROW("QUANT_CONVOLUTION: only accept input and weights of type int8_t or "
+                           "fp8e4m3fnuz_type");
        }
-        t = shape::int32_type;

        std::vector<size_t> output_lens{input.lens()[0], weights.lens()[0]};
        auto padding_size = padding.size();
@@ -107,8 +110,11 @@ struct quant_convolution
                        stride[i] +
                    1)));
        }
-
-        return inputs[0].with_lens(t, output_lens);
+        if(t == shape::int8_type)
+        {
+            return inputs[0].with_lens(shape::int32_type, output_lens);
+        } // else fp8 conv
+        return inputs[0].with_lens(shape::float_type, output_lens);
    }

    size_t kdims() const

--- a/src/include/migraphx/op/quant_dot.hpp
+++ b/src/include/migraphx/op/quant_dot.hpp
@@ -44,9 +44,11 @@ struct quant_dot
        const shape& a = inputs.at(0);
        const shape& b = inputs.at(1);
        auto t         = a.type();
-        if(t != shape::int8_type)
+        std::set<migraphx::shape::type_t> suppported_types = {shape::int8_type,
+                                                              shape::fp8e4m3fnuz_type};
+        if(not contains(suppported_types, t))
        {
-            MIGRAPHX_THROW("QUANT_DOT: only support data type int8_t");
+            MIGRAPHX_THROW("QUANT_DOT: only support data type int8_t and fp8e4m3fnuz_type");
        }

        if(not std::all_of(
@@ -73,6 +75,10 @@ struct quant_dot

        auto out_lens   = a.lens();
        out_lens[dim_1] = b.lens()[dim_1];
+        if(t == shape::fp8e4m3fnuz_type)
+        {
+            return {shape::float_type, out_lens};
+        } // else int8 gemm
        return {shape::int32_type, out_lens};
    }
 };

--- a/src/include/migraphx/op/quantizelinear.hpp
+++ b/src/include/migraphx/op/quantizelinear.hpp
--- a/src/include/migraphx/op/random_uniform.hpp
+++ b/src/include/migraphx/op/random_uniform.hpp
@@ -65,11 +65,10 @@ struct random_uniform
        return inputs.at(1);
    }

-    argument compute(const shape&, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
        // Output goes into the passed buffer, not the shape output.
-        auto result = args[1];
-
+        argument result{dyn_out.computed_shape};
        uint64_t local_seed = args[0].at<uint64_t>(0);
        std::mt19937 gen(local_seed);


--- a/src/include/migraphx/op/reshape.hpp
+++ b/src/include/migraphx/op/reshape.hpp
--- a/src/include/migraphx/op/reshape_lazy.hpp
+++ b/src/include/migraphx/op/reshape_lazy.hpp
--- a/src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
--- a/src/include/migraphx/op/scatternd_op.hpp
+++ b/src/include/migraphx/op/scatternd_op.hpp
--- a/src/include/migraphx/op/slice.hpp
+++ b/src/include/migraphx/op/slice.hpp
--- a/src/include/migraphx/op/unary.hpp
+++ b/src/include/migraphx/op/unary.hpp