Unverified Commit 70d9faf7 authored by Chris Austen's avatar Chris Austen Committed by GitHub
Browse files

Merge branch 'develop' into mi200

parents a56c531c a60bdb67
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <limits>
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <migraphx/float_equal.hpp>
#include <migraphx/float8.hpp>
#include <migraphx/half.hpp>
#include <migraphx/ranges.hpp>
#include "test.hpp"
// Decodes a fp8e4m3fnuz bit pattern into its exact float value.
//
// The table below is an oracle written out by hand, independent of the
// library's own conversion logic, so the cast tests compare against it.
// Index == raw 8-bit encoding; 0x80 (which would be -0 in a signed format)
// is the single NaN encoding of the fnuz scheme; there is no infinity.
//
// @param input raw fp8e4m3fnuz bits (any of the 256 values is valid)
// @return the float value the encoding represents (NaN for 0x80)
float fp8e4m3fnuz_to_fp32_value(uint8_t input)
{
    // static constexpr: the 1 KiB table lives once in read-only storage
    // instead of being materialized on the stack for every call.
    static constexpr std::array<float, 256> e4m3fnuz_lut = {
        0.0f, 0.0009765625f, 0.001953125f,
        0.0029296875f, 0.00390625f, 0.0048828125f,
        0.005859375f, 0.0068359375f, 0.0078125f,
        0.0087890625f, 0.009765625f, 0.0107421875f,
        0.01171875f, 0.0126953125f, 0.013671875f,
        0.0146484375f, 0.015625f, 0.017578125f,
        0.01953125f, 0.021484375f, 0.0234375f,
        0.025390625f, 0.02734375f, 0.029296875f,
        0.03125f, 0.03515625f, 0.0390625f,
        0.04296875f, 0.046875f, 0.05078125f,
        0.0546875f, 0.05859375f, 0.0625f,
        0.0703125f, 0.078125f, 0.0859375f,
        0.09375f, 0.1015625f, 0.109375f,
        0.1171875f, 0.125f, 0.140625f,
        0.15625f, 0.171875f, 0.1875f,
        0.203125f, 0.21875f, 0.234375f,
        0.25f, 0.28125f, 0.3125f,
        0.34375f, 0.375f, 0.40625f,
        0.4375f, 0.46875f, 0.5f,
        0.5625f, 0.625f, 0.6875f,
        0.75f, 0.8125f, 0.875f,
        0.9375f, 1.0f, 1.125f,
        1.25f, 1.375f, 1.5f,
        1.625f, 1.75f, 1.875f,
        2.0f, 2.25f, 2.5f,
        2.75f, 3.0f, 3.25f,
        3.5f, 3.75f, 4.0f,
        4.5f, 5.0f, 5.5f,
        6.0f, 6.5f, 7.0f,
        7.5f, 8.0f, 9.0f,
        10.0f, 11.0f, 12.0f,
        13.0f, 14.0f, 15.0f,
        16.0f, 18.0f, 20.0f,
        22.0f, 24.0f, 26.0f,
        28.0f, 30.0f, 32.0f,
        36.0f, 40.0f, 44.0f,
        48.0f, 52.0f, 56.0f,
        60.0f, 64.0f, 72.0f,
        80.0f, 88.0f, 96.0f,
        104.0f, 112.0f, 120.0f,
        128.0f, 144.0f, 160.0f,
        176.0f, 192.0f, 208.0f,
        // 0x7f is the maximum finite value (240); 0x80 is NaN (no -0 in fnuz)
        224.0f, 240.0f, std::numeric_limits<float>::quiet_NaN(),
        -0.0009765625f, -0.001953125f, -0.0029296875f,
        -0.00390625f, -0.0048828125f, -0.005859375f,
        -0.0068359375f, -0.0078125f, -0.0087890625f,
        -0.009765625f, -0.0107421875f, -0.01171875f,
        -0.0126953125f, -0.013671875f, -0.0146484375f,
        -0.015625f, -0.017578125f, -0.01953125f,
        -0.021484375f, -0.0234375f, -0.025390625f,
        -0.02734375f, -0.029296875f, -0.03125f,
        -0.03515625f, -0.0390625f, -0.04296875f,
        -0.046875f, -0.05078125f, -0.0546875f,
        -0.05859375f, -0.0625f, -0.0703125f,
        -0.078125f, -0.0859375f, -0.09375f,
        -0.1015625f, -0.109375f, -0.1171875f,
        -0.125f, -0.140625f, -0.15625f,
        -0.171875f, -0.1875f, -0.203125f,
        -0.21875f, -0.234375f, -0.25f,
        -0.28125f, -0.3125f, -0.34375f,
        -0.375f, -0.40625f, -0.4375f,
        -0.46875f, -0.5f, -0.5625f,
        -0.625f, -0.6875f, -0.75f,
        -0.8125f, -0.875f, -0.9375f,
        -1.0f, -1.125f, -1.25f,
        -1.375f, -1.5f, -1.625f,
        -1.75f, -1.875f, -2.0f,
        -2.25f, -2.5f, -2.75f,
        -3.0f, -3.25f, -3.5f,
        -3.75f, -4.0f, -4.5f,
        -5.0f, -5.5f, -6.0f,
        -6.5f, -7.0f, -7.5f,
        -8.0f, -9.0f, -10.0f,
        -11.0f, -12.0f, -13.0f,
        -14.0f, -15.0f, -16.0f,
        -18.0f, -20.0f, -22.0f,
        -24.0f, -26.0f, -28.0f,
        -30.0f, -32.0f, -36.0f,
        -40.0f, -44.0f, -48.0f,
        -52.0f, -56.0f, -60.0f,
        -64.0f, -72.0f, -80.0f,
        -88.0f, -96.0f, -104.0f,
        -112.0f, -120.0f, -128.0f,
        -144.0f, -160.0f, -176.0f,
        -192.0f, -208.0f, -224.0f,
        -240.0f,
    };
    // input is uint8_t, so it is always a valid index into the 256-entry table
    return e4m3fnuz_lut[input];
}
// Exhaustively checks that casting every one of the 256 fp8e4m3fnuz bit
// patterns to float agrees with the independent lookup-table oracle above.
TEST_CASE(test_fp8_cast_to_float)
{
std::vector<uint8_t> bit_vals(256);
std::iota(bit_vals.begin(), bit_vals.end(), 0);
EXPECT(bool{std::all_of(bit_vals.begin(), bit_vals.end(), [](uint8_t bit_val) {
migraphx::fp8::fp8e4m3fnuz fp8_val(bit_val, migraphx::fp8::fp8e4m3fnuz::from_bits());
// NaN != NaN under float_equal, so NaN agreement is checked separately
if(std::isnan(float(fp8_val)) and std::isnan(fp8e4m3fnuz_to_fp32_value(bit_val)))
{
return true;
}
return migraphx::float_equal(float(fp8_val), fp8e4m3fnuz_to_fp32_value(bit_val));
})});
}
// Spot-checks float -> fp8e4m3fnuz conversion against hand-computed bit
// patterns, including saturation (|x| > 240), rounding of nearby decimals,
// and flush-to-zero of values below the smallest subnormal.
TEST_CASE(test_fp8_cast_from_float)
{
std::unordered_map<float, uint8_t> test_vals = {{256, 0x7f}, {-256, 0xff},
{240, 0x7f}, {-240, 0xff},
{1e-07, 0x0}, {1e+07, 0x7f},
{1, 0x40}, {-1, 0xc0},
{0.1, 0x25}, {0.11, 0x26},
{0.111, 0x26}, {0.1111, 0x26},
{-0.1, 0xa5}, {-0.11, 0xa6},
{-0.111, 0xa6}, {-0.1111, 0xa6},
{0.2, 0x2d}, {2, 0x48},
{20, 0x62}, {200, 0x7c},
{-0.2, 0xad}, {-2, 0xc8},
{-20, 0xe2}, {-200, 0xfc},
{0.5, 0x38}, {-0.5, 0xb8},
{1.17549e-38, 0x0}, {1.4013e-45, 0x0},
{0.00390625, 0x4}, {-0.00390625, 0x84},
{0.00195312, 0x2}, {-0.00195312, 0x82},
{0.000976562, 0x1}, {-0.000976562, 0x81},
{0.000488281, 0x0}, {-0.000488281, 0x0}};
EXPECT(bool{std::all_of(test_vals.begin(), test_vals.end(), [](const auto sample) {
return migraphx::float_equal(
migraphx::fp8::fp8e4m3fnuz(sample.first),
migraphx::fp8::fp8e4m3fnuz(sample.second, migraphx::fp8::fp8e4m3fnuz::from_bits()));
})});
}
// +0.0 round-trips exactly through fp8e4m3fnuz.
TEST_CASE(test_positive_zero)
{
float zero = 0.0;
migraphx::fp8::fp8e4m3fnuz fp8_zero(zero);
EXPECT(fp8_zero.is_zero());
EXPECT(migraphx::float_equal(zero, float(fp8_zero)));
}
// fnuz formats have no -0 encoding: -0.0 must collapse to +0.0.
TEST_CASE(test_negative_zero)
{
float nzero = -0.0;
float pzero = 0.0;
migraphx::fp8::fp8e4m3fnuz fp8_nzero(nzero);
EXPECT(fp8_nzero.is_zero());
// negative zero gets converted to positive zero
EXPECT(migraphx::float_equal(pzero, float(fp8_nzero)));
}
// float NaN converts to the fp8 NaN encoding.
TEST_CASE(test_nan_1)
{
float fnan = std::numeric_limits<float>::quiet_NaN();
migraphx::fp8::fp8e4m3fnuz fp8_nan(fnan);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
}
// The fp8 type's own quiet_NaN bit pattern is NaN both as fp8 and as float.
TEST_CASE(test_nan_2)
{
auto fnan = std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::quiet_NaN();
migraphx::fp8::fp8e4m3fnuz fp8_nan(fnan.data, migraphx::fp8::fp8e4m3fnuz::from_bits());
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
EXPECT(std::isnan(float(fp8_nan)));
}
TEST_CASE(test_infinity_1)
{
float finf = std::numeric_limits<float>::infinity();
// no inf in fp8e4m3fnuz it gets clipped to Nans
migraphx::fp8::fp8e4m3fnuz fp8_nan(finf);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(float(fp8_nan)));
}
TEST_CASE(test_infinity_2)
{
// neg inf
float finf = -1.0 * std::numeric_limits<float>::infinity();
// no inf in fp8e4m3fnuz it gets clipped to NaNs
migraphx::fp8::fp8e4m3fnuz fp8_nan(finf);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(float(fp8_nan)));
}
// float::max saturates to the fp8 maximum (240).
TEST_CASE(test_numeric_max_1)
{
float fmax = std::numeric_limits<float>::max();
migraphx::fp8::fp8e4m3fnuz fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::max());
}
TEST_CASE(test_numeric_max_2)
{
// gets clipped to max
float fmax = 2 * std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::max();
migraphx::fp8::fp8e4m3fnuz fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::max());
}
// float::lowest saturates to the fp8 lowest (-240).
TEST_CASE(test_numeric_lowest_1)
{
float flowest = std::numeric_limits<float>::lowest();
migraphx::fp8::fp8e4m3fnuz fp8_lowest(flowest);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::lowest());
}
TEST_CASE(test_numeric_lowest_2)
{
// gets clipped to lowest
float fmin = 2.0 * std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::lowest();
migraphx::fp8::fp8e4m3fnuz fp8_lowest(fmin);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::lowest());
}
// The fp8e4m3fnuz range is symmetric: lowest() == -max().
TEST_CASE(test_max_eq_lowest)
{
EXPECT(migraphx::float_equal(std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::lowest(),
-1 * std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::max()));
}
// Zeros are finite; the NaN encoding is not.
TEST_CASE(test_isfinite)
{
EXPECT(std::isfinite(migraphx::fp8::fp8e4m3fnuz(0.0)));
EXPECT(std::isfinite(migraphx::fp8::fp8e4m3fnuz(-0.0)));
EXPECT(not std::isfinite(
migraphx::fp8::fp8e4m3fnuz(std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::quiet_NaN())));
}
// fp8e4m3fnuz advertises no infinity via numeric_limits.
TEST_CASE(test_no_infinity)
{
EXPECT(not bool{std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::has_infinity});
}
// Arithmetic and comparison operators: x + (-x) == 0, and ordering of
// positive vs negative values behaves as for regular floats.
TEST_CASE(test_binary_ops)
{
auto a = migraphx::fp8::fp8e4m3fnuz(-1.0);
auto b = migraphx::fp8::fp8e4m3fnuz(1.0);
auto c = migraphx::fp8::fp8e4m3fnuz(0.0);
auto d = migraphx::fp8::fp8e4m3fnuz(-0.0);
EXPECT(migraphx::float_equal((c + d), c));
EXPECT(migraphx::float_equal((c + d), d));
EXPECT(migraphx::float_equal((a + b), c));
EXPECT(migraphx::float_equal((a + b), d));
auto e = migraphx::fp8::fp8e4m3fnuz(10.0);
auto f = migraphx::fp8::fp8e4m3fnuz(-10.0);
EXPECT(bool{e > f});
EXPECT(bool{f < e});
EXPECT(bool{f <= e});
EXPECT(bool{e >= f});
EXPECT(bool{e <= e});
EXPECT(bool{f >= f});
EXPECT(not migraphx::float_equal(f, e));
}
// fabs(-1) == 1 for the fp8 type.
TEST_CASE(test_fabs)
{
auto a = migraphx::fp8::fp8e4m3fnuz(-1.0);
auto b = migraphx::fp8::fp8e4m3fnuz(1.0);
EXPECT(migraphx::float_equal(b, migraphx::fp8::fabs(a)));
}
// operator<< prints the numeric value ("-1") and NaN as "nan".
TEST_CASE(test_stream_op)
{
auto a = migraphx::fp8::fp8e4m3fnuz(-1.0);
std::stringstream ss;
ss << a;
EXPECT(std::string("-1") == ss.str());
ss = std::stringstream();
auto b = std::numeric_limits<migraphx::fp8::fp8e4m3fnuz>::quiet_NaN();
ss << b;
EXPECT(std::string("nan") == ss.str());
}
// Test-framework entry point: runs every TEST_CASE registered above.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <cmath>
#include <migraphx/float_equal.hpp>
#include <migraphx/float8.hpp>
#include <migraphx/half.hpp>
#include <migraphx/ranges.hpp>
#include "test.hpp"
#include <limits>
#include <sstream>
// Decodes a fp8e5m2 bit pattern into its exact float value.
//
// The table is a hand-written oracle, independent of the library's own
// conversion logic. Index == raw 8-bit encoding. Unlike the fnuz variants,
// e5m2 keeps IEEE-style specials: -0 at 0x80, +/-infinity at 0x7c/0xfc,
// and NaN payloads at 0x7d-0x7f and 0xfd-0xff.
//
// @param input raw fp8e5m2 bits (any of the 256 values is valid)
// @return the float value the encoding represents
float fp8e5m2_to_fp32_value(uint8_t input)
{
    // Renamed from the copy-pasted "e4m3fnuz_lut" (this table is e5m2, not
    // e4m3fnuz). static constexpr keeps the 1 KiB table in read-only storage
    // instead of rebuilding it per call. One row per mantissa cycle (4 codes).
    static constexpr std::array<float, 256> e5m2_lut = {
        0.0f, 1.52587890625e-05f, 3.0517578125e-05f, 4.57763671875e-05f,
        6.103515625e-05f, 7.62939453125e-05f, 9.1552734375e-05f, 0.0001068115234375f,
        0.0001220703125f, 0.000152587890625f, 0.00018310546875f, 0.000213623046875f,
        0.000244140625f, 0.00030517578125f, 0.0003662109375f, 0.00042724609375f,
        0.00048828125f, 0.0006103515625f, 0.000732421875f, 0.0008544921875f,
        0.0009765625f, 0.001220703125f, 0.00146484375f, 0.001708984375f,
        0.001953125f, 0.00244140625f, 0.0029296875f, 0.00341796875f,
        0.00390625f, 0.0048828125f, 0.005859375f, 0.0068359375f,
        0.0078125f, 0.009765625f, 0.01171875f, 0.013671875f,
        0.015625f, 0.01953125f, 0.0234375f, 0.02734375f,
        0.03125f, 0.0390625f, 0.046875f, 0.0546875f,
        0.0625f, 0.078125f, 0.09375f, 0.109375f,
        0.125f, 0.15625f, 0.1875f, 0.21875f,
        0.25f, 0.3125f, 0.375f, 0.4375f,
        0.5f, 0.625f, 0.75f, 0.875f,
        1.0f, 1.25f, 1.5f, 1.75f,
        2.0f, 2.5f, 3.0f, 3.5f,
        4.0f, 5.0f, 6.0f, 7.0f,
        8.0f, 10.0f, 12.0f, 14.0f,
        16.0f, 20.0f, 24.0f, 28.0f,
        32.0f, 40.0f, 48.0f, 56.0f,
        64.0f, 80.0f, 96.0f, 112.0f,
        128.0f, 160.0f, 192.0f, 224.0f,
        256.0f, 320.0f, 384.0f, 448.0f,
        512.0f, 640.0f, 768.0f, 896.0f,
        1024.0f, 1280.0f, 1536.0f, 1792.0f,
        2048.0f, 2560.0f, 3072.0f, 3584.0f,
        4096.0f, 5120.0f, 6144.0f, 7168.0f,
        8192.0f, 10240.0f, 12288.0f, 14336.0f,
        16384.0f, 20480.0f, 24576.0f, 28672.0f,
        32768.0f, 40960.0f, 49152.0f, 57344.0f,
        // 0x7c = +inf, 0x7d-0x7f = NaN
        std::numeric_limits<float>::infinity(),
        std::numeric_limits<float>::quiet_NaN(),
        std::numeric_limits<float>::quiet_NaN(),
        std::numeric_limits<float>::quiet_NaN(),
        // negative half mirrors the positive half, starting at -0 (0x80)
        -0.0f, -1.52587890625e-05f, -3.0517578125e-05f, -4.57763671875e-05f,
        -6.103515625e-05f, -7.62939453125e-05f, -9.1552734375e-05f, -0.0001068115234375f,
        -0.0001220703125f, -0.000152587890625f, -0.00018310546875f, -0.000213623046875f,
        -0.000244140625f, -0.00030517578125f, -0.0003662109375f, -0.00042724609375f,
        -0.00048828125f, -0.0006103515625f, -0.000732421875f, -0.0008544921875f,
        -0.0009765625f, -0.001220703125f, -0.00146484375f, -0.001708984375f,
        -0.001953125f, -0.00244140625f, -0.0029296875f, -0.00341796875f,
        -0.00390625f, -0.0048828125f, -0.005859375f, -0.0068359375f,
        -0.0078125f, -0.009765625f, -0.01171875f, -0.013671875f,
        -0.015625f, -0.01953125f, -0.0234375f, -0.02734375f,
        -0.03125f, -0.0390625f, -0.046875f, -0.0546875f,
        -0.0625f, -0.078125f, -0.09375f, -0.109375f,
        -0.125f, -0.15625f, -0.1875f, -0.21875f,
        -0.25f, -0.3125f, -0.375f, -0.4375f,
        -0.5f, -0.625f, -0.75f, -0.875f,
        -1.0f, -1.25f, -1.5f, -1.75f,
        -2.0f, -2.5f, -3.0f, -3.5f,
        -4.0f, -5.0f, -6.0f, -7.0f,
        -8.0f, -10.0f, -12.0f, -14.0f,
        -16.0f, -20.0f, -24.0f, -28.0f,
        -32.0f, -40.0f, -48.0f, -56.0f,
        -64.0f, -80.0f, -96.0f, -112.0f,
        -128.0f, -160.0f, -192.0f, -224.0f,
        -256.0f, -320.0f, -384.0f, -448.0f,
        -512.0f, -640.0f, -768.0f, -896.0f,
        -1024.0f, -1280.0f, -1536.0f, -1792.0f,
        -2048.0f, -2560.0f, -3072.0f, -3584.0f,
        -4096.0f, -5120.0f, -6144.0f, -7168.0f,
        -8192.0f, -10240.0f, -12288.0f, -14336.0f,
        -16384.0f, -20480.0f, -24576.0f, -28672.0f,
        -32768.0f, -40960.0f, -49152.0f, -57344.0f,
        // 0xfc = -inf, 0xfd-0xff = NaN
        -1.0f * std::numeric_limits<float>::infinity(),
        std::numeric_limits<float>::quiet_NaN(),
        std::numeric_limits<float>::quiet_NaN(),
        std::numeric_limits<float>::quiet_NaN(),
    };
    // input is uint8_t, so it is always a valid index into the 256-entry table
    return e5m2_lut[input];
}
// Exhaustively checks that casting every one of the 256 fp8e5m2 bit patterns
// to float agrees with the lookup-table oracle above (NaN and +/-inf encodings
// are compared by class, since NaN != NaN and float_equal is for finite values).
TEST_CASE(test_fp8_cast_to_float)
{
std::vector<uint8_t> bit_vals(256);
std::iota(bit_vals.begin(), bit_vals.end(), 0);
EXPECT(bool{std::all_of(bit_vals.begin(), bit_vals.end(), [](uint8_t bit_val) {
migraphx::fp8::fp8e5m2 fp8_val(bit_val, migraphx::fp8::fp8e5m2::from_bits());
if(std::isnan(float(fp8_val)) and std::isnan(fp8e5m2_to_fp32_value(bit_val)))
{
return true;
}
else if(std::isinf(float(fp8_val)) and std::isinf(fp8e5m2_to_fp32_value(bit_val)))
{
return true;
}
return migraphx::float_equal(float(fp8_val), fp8e5m2_to_fp32_value(bit_val));
})});
}
// Spot-checks float -> fp8e5m2 conversion against hand-computed bit patterns,
// including saturation beyond +/-57344, rounding, and flush-to-zero below the
// smallest subnormal.
TEST_CASE(test_fp8_cast_from_float)
{
std::unordered_map<float, uint8_t> test_vals = {
{-60000, 0xfb},
{-57344, 0xfb},
{-448, 0xdf},
{-256, 0xdc},
{-240, 0xdc},
{-200, 0xda},
{-20, 0xcd},
{-2, 0xc0},
{-1, 0xbc},
{-0.5, 0xb8},
{-0.2, 0xb2},
{-0.1111, 0xaf},
{-0.111, 0xaf},
{-0.11, 0xaf},
{-0.1, 0xae},
{6.10351e-05, 0x4},
{-6.10351e-05, 0x84},
{3.05176e-05, 0x2},
{-3.05176e-05, 0x82},
{1.52588e-05, 0x1},
{-1.52588e-05, 0x81},
{7.62939e-06, 0x0},
{-7.62939e-06, 0x80},
{0.1, 0x2e},
{0.11, 0x2f},
{0.111, 0x2f},
{0.1111, 0x2f},
{0.2, 0x32},
{0.5, 0x38},
{1, 0x3c},
{2, 0x40},
{20, 0x4d},
{200, 0x5a},
{240, 0x5c},
{256, 0x5c},
{448, 0x5f},
{57344, 0x7b},
{60000, 0x7b},
{1e+07, 0x7b},
};
EXPECT(bool{std::all_of(test_vals.begin(), test_vals.end(), [](const auto sample) {
return migraphx::float_equal(
migraphx::fp8::fp8e5m2(sample.first),
migraphx::fp8::fp8e5m2(sample.second, migraphx::fp8::fp8e5m2::from_bits()));
})});
}
// +0.0 round-trips exactly through fp8e5m2.
TEST_CASE(test_positive_zero)
{
float zero = 0.0;
migraphx::fp8::fp8e5m2 fp8_zero(zero);
EXPECT(fp8_zero.is_zero());
EXPECT(migraphx::float_equal(zero, float(fp8_zero)));
}
// e5m2 has a -0 encoding (unlike the fnuz variants), so -0.0 round-trips.
TEST_CASE(test_negative_zero)
{
float nzero = -0.0;
migraphx::fp8::fp8e5m2 fp8_nzero(nzero);
EXPECT(fp8_nzero.is_zero());
// negative zero is preserved for fp8e5m2
EXPECT(migraphx::float_equal(nzero, float(fp8_nzero)));
}
// +0 and -0 compare equal, as for IEEE floats.
TEST_CASE(test_pos_zero_eq_neg_zero)
{
float nzero = -0.0;
float pzero = 0.0;
migraphx::fp8::fp8e5m2 fp8_nzero(nzero);
migraphx::fp8::fp8e5m2 fp8_pzero(pzero);
EXPECT(fp8_nzero == fp8_pzero);
}
// float NaN converts to an fp8e5m2 NaN encoding.
TEST_CASE(test_nan_1)
{
float fnan = std::numeric_limits<float>::quiet_NaN();
migraphx::fp8::fp8e5m2 fp8_nan(fnan);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
}
// The fp8 type's own quiet_NaN bit pattern is NaN both as fp8 and as float.
TEST_CASE(test_nan_2)
{
auto fnan = std::numeric_limits<migraphx::fp8::fp8e5m2>::quiet_NaN();
migraphx::fp8::fp8e5m2 fp8_nan(fnan.data, migraphx::fp8::fp8e5m2::from_bits());
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
EXPECT(std::isnan(float(fp8_nan)));
}
TEST_CASE(test_infinity_1)
{
// float infinity should get clipped to max
float finf = std::numeric_limits<float>::infinity();
migraphx::fp8::fp8e5m2 fp8_max(finf);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e5m2>::max());
}
TEST_CASE(test_infinity_2)
{
// neg inf
float finf = -1.0 * std::numeric_limits<float>::infinity();
// no inf in fp8e5m2, it gets clipped to lowest
migraphx::fp8::fp8e5m2 fp8_lowest(finf);
EXPECT(bool{fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e5m2>::lowest()});
}
// float::max saturates to the fp8e5m2 maximum (57344).
TEST_CASE(test_numeric_max_1)
{
float fmax = std::numeric_limits<float>::max();
migraphx::fp8::fp8e5m2 fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e5m2>::max());
}
TEST_CASE(test_numeric_max_2)
{
// gets clipped to max
float fmax = 2 * std::numeric_limits<migraphx::fp8::fp8e5m2>::max();
migraphx::fp8::fp8e5m2 fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e5m2>::max());
}
// float::lowest saturates to the fp8e5m2 lowest (-57344).
TEST_CASE(test_numeric_lowest_1)
{
float flowest = std::numeric_limits<float>::lowest();
migraphx::fp8::fp8e5m2 fp8_lowest(flowest);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e5m2>::lowest());
}
TEST_CASE(test_numeric_lowest_2)
{
// gets clipped to lowest
float fmin = 2.0 * std::numeric_limits<migraphx::fp8::fp8e5m2>::lowest();
migraphx::fp8::fp8e5m2 fp8_lowest(fmin);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e5m2>::lowest());
}
// The finite fp8e5m2 range is symmetric: lowest() == -max().
TEST_CASE(test_max_eq_lowest)
{
EXPECT(migraphx::float_equal(std::numeric_limits<migraphx::fp8::fp8e5m2>::lowest(),
-1 * std::numeric_limits<migraphx::fp8::fp8e5m2>::max()));
}
// Zeros are finite; NaN and the inf bit pattern (0xfc) are not. Converting
// float -inf saturates to lowest(), which IS finite.
TEST_CASE(test_isfinite)
{
EXPECT(std::isfinite(migraphx::fp8::fp8e5m2(0.0)));
EXPECT(std::isfinite(migraphx::fp8::fp8e5m2(-0.0)));
EXPECT(not std::isfinite(
migraphx::fp8::fp8e5m2(std::numeric_limits<migraphx::fp8::fp8e5m2>::quiet_NaN())));
EXPECT(not std::isfinite(std::numeric_limits<migraphx::fp8::fp8e5m2>::infinity()));
// -1.0 * inf is float(-inf) which with clipping/saturation gets converted into fp8::lowest()
EXPECT(std::isfinite(
migraphx::fp8::fp8e5m2(-1.0 * std::numeric_limits<migraphx::fp8::fp8e5m2>::infinity())));
EXPECT(not std::isfinite(migraphx::fp8::fp8e5m2(0xFC, migraphx::fp8::fp8e5m2::from_bits())));
}
// Arithmetic and comparison operators: x + (-x) == 0, and ordering of
// positive vs negative values behaves as for regular floats.
TEST_CASE(test_binary_ops)
{
auto a = migraphx::fp8::fp8e5m2(-1.0);
auto b = migraphx::fp8::fp8e5m2(1.0);
auto c = migraphx::fp8::fp8e5m2(0.0);
auto d = migraphx::fp8::fp8e5m2(-0.0);
EXPECT(migraphx::float_equal((c + d), c));
EXPECT(migraphx::float_equal((c + d), d));
EXPECT(migraphx::float_equal((a + b), c));
EXPECT(migraphx::float_equal((a + b), d));
auto e = migraphx::fp8::fp8e5m2(10.0);
auto f = migraphx::fp8::fp8e5m2(-10.0);
EXPECT(bool{e > f});
EXPECT(bool{f < e});
EXPECT(bool{f <= e});
EXPECT(bool{e >= f});
EXPECT(bool{e <= e});
EXPECT(bool{f >= f});
EXPECT(not migraphx::float_equal(f, e));
}
// fabs(-1) == 1 for the fp8 type.
TEST_CASE(test_fabs)
{
auto a = migraphx::fp8::fp8e5m2(-1.0);
auto b = migraphx::fp8::fp8e5m2(1.0);
EXPECT(migraphx::float_equal(b, migraphx::fp8::fabs(a)));
}
// operator<< prints the numeric value ("-1") and NaN as "nan".
TEST_CASE(test_stream_op)
{
auto a = migraphx::fp8::fp8e5m2(-1.0);
std::stringstream ss;
ss << a;
EXPECT(std::string("-1") == ss.str());
ss = std::stringstream();
auto b = std::numeric_limits<migraphx::fp8::fp8e5m2>::quiet_NaN();
ss << b;
EXPECT(std::string("nan") == ss.str());
}
// Test-framework entry point: runs every TEST_CASE registered above.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <cmath>
#include <migraphx/float_equal.hpp>
#include <migraphx/float8.hpp>
#include <migraphx/half.hpp>
#include <migraphx/ranges.hpp>
#include "test.hpp"
#include <limits>
// Decodes a fp8e5m2fnuz bit pattern into its exact float value.
//
// The table is a hand-written oracle, independent of the library's own
// conversion logic. Index == raw 8-bit encoding. In the fnuz scheme there is
// no infinity and no -0: 0x80 is the single NaN encoding, and the finite
// range runs to +/-57344 at 0x7f/0xff.
//
// @param input raw fp8e5m2fnuz bits (any of the 256 values is valid)
// @return the float value the encoding represents (NaN for 0x80)
float fp8e5m2fnuz_to_fp32_value(uint8_t input)
{
    // Renamed from the copy-pasted "e4m3fnuz_lut" (this table is e5m2fnuz,
    // not e4m3fnuz). static constexpr keeps the 1 KiB table in read-only
    // storage instead of rebuilding it per call. One row per 4 codes.
    static constexpr std::array<float, 256> e5m2fnuz_lut = {
        0.0f, 7.62939453125e-06f, 1.52587890625e-05f, 2.288818359375e-05f,
        3.0517578125e-05f, 3.814697265625e-05f, 4.57763671875e-05f, 5.340576171875e-05f,
        6.103515625e-05f, 7.62939453125e-05f, 9.1552734375e-05f, 0.0001068115234375f,
        0.0001220703125f, 0.000152587890625f, 0.00018310546875f, 0.000213623046875f,
        0.000244140625f, 0.00030517578125f, 0.0003662109375f, 0.00042724609375f,
        0.00048828125f, 0.0006103515625f, 0.000732421875f, 0.0008544921875f,
        0.0009765625f, 0.001220703125f, 0.00146484375f, 0.001708984375f,
        0.001953125f, 0.00244140625f, 0.0029296875f, 0.00341796875f,
        0.00390625f, 0.0048828125f, 0.005859375f, 0.0068359375f,
        0.0078125f, 0.009765625f, 0.01171875f, 0.013671875f,
        0.015625f, 0.01953125f, 0.0234375f, 0.02734375f,
        0.03125f, 0.0390625f, 0.046875f, 0.0546875f,
        0.0625f, 0.078125f, 0.09375f, 0.109375f,
        0.125f, 0.15625f, 0.1875f, 0.21875f,
        0.25f, 0.3125f, 0.375f, 0.4375f,
        0.5f, 0.625f, 0.75f, 0.875f,
        1.0f, 1.25f, 1.5f, 1.75f,
        2.0f, 2.5f, 3.0f, 3.5f,
        4.0f, 5.0f, 6.0f, 7.0f,
        8.0f, 10.0f, 12.0f, 14.0f,
        16.0f, 20.0f, 24.0f, 28.0f,
        32.0f, 40.0f, 48.0f, 56.0f,
        64.0f, 80.0f, 96.0f, 112.0f,
        128.0f, 160.0f, 192.0f, 224.0f,
        256.0f, 320.0f, 384.0f, 448.0f,
        512.0f, 640.0f, 768.0f, 896.0f,
        1024.0f, 1280.0f, 1536.0f, 1792.0f,
        2048.0f, 2560.0f, 3072.0f, 3584.0f,
        4096.0f, 5120.0f, 6144.0f, 7168.0f,
        8192.0f, 10240.0f, 12288.0f, 14336.0f,
        16384.0f, 20480.0f, 24576.0f, 28672.0f,
        32768.0f, 40960.0f, 49152.0f, 57344.0f,
        // 0x80 (the would-be -0) is the single NaN encoding of fnuz formats
        std::numeric_limits<float>::quiet_NaN(),
        -7.62939453125e-06f, -1.52587890625e-05f, -2.288818359375e-05f,
        -3.0517578125e-05f, -3.814697265625e-05f, -4.57763671875e-05f, -5.340576171875e-05f,
        -6.103515625e-05f, -7.62939453125e-05f, -9.1552734375e-05f, -0.0001068115234375f,
        -0.0001220703125f, -0.000152587890625f, -0.00018310546875f, -0.000213623046875f,
        -0.000244140625f, -0.00030517578125f, -0.0003662109375f, -0.00042724609375f,
        -0.00048828125f, -0.0006103515625f, -0.000732421875f, -0.0008544921875f,
        -0.0009765625f, -0.001220703125f, -0.00146484375f, -0.001708984375f,
        -0.001953125f, -0.00244140625f, -0.0029296875f, -0.00341796875f,
        -0.00390625f, -0.0048828125f, -0.005859375f, -0.0068359375f,
        -0.0078125f, -0.009765625f, -0.01171875f, -0.013671875f,
        -0.015625f, -0.01953125f, -0.0234375f, -0.02734375f,
        -0.03125f, -0.0390625f, -0.046875f, -0.0546875f,
        -0.0625f, -0.078125f, -0.09375f, -0.109375f,
        -0.125f, -0.15625f, -0.1875f, -0.21875f,
        -0.25f, -0.3125f, -0.375f, -0.4375f,
        -0.5f, -0.625f, -0.75f, -0.875f,
        -1.0f, -1.25f, -1.5f, -1.75f,
        -2.0f, -2.5f, -3.0f, -3.5f,
        -4.0f, -5.0f, -6.0f, -7.0f,
        -8.0f, -10.0f, -12.0f, -14.0f,
        -16.0f, -20.0f, -24.0f, -28.0f,
        -32.0f, -40.0f, -48.0f, -56.0f,
        -64.0f, -80.0f, -96.0f, -112.0f,
        -128.0f, -160.0f, -192.0f, -224.0f,
        -256.0f, -320.0f, -384.0f, -448.0f,
        -512.0f, -640.0f, -768.0f, -896.0f,
        -1024.0f, -1280.0f, -1536.0f, -1792.0f,
        -2048.0f, -2560.0f, -3072.0f, -3584.0f,
        -4096.0f, -5120.0f, -6144.0f, -7168.0f,
        -8192.0f, -10240.0f, -12288.0f, -14336.0f,
        -16384.0f, -20480.0f, -24576.0f, -28672.0f,
        -32768.0f, -40960.0f, -49152.0f, -57344.0f,
    };
    // input is uint8_t, so it is always a valid index into the 256-entry table
    return e5m2fnuz_lut[input];
}
// Exhaustively checks that casting every one of the 256 fp8e5m2fnuz bit
// patterns to float agrees with the lookup-table oracle above.
TEST_CASE(test_fp8_cast_to_float)
{
std::vector<uint8_t> bit_vals(256);
std::iota(bit_vals.begin(), bit_vals.end(), 0);
EXPECT(bool{std::all_of(bit_vals.begin(), bit_vals.end(), [](uint8_t bit_val) {
migraphx::fp8::fp8e5m2fnuz fp8_val(bit_val, migraphx::fp8::fp8e5m2fnuz::from_bits());
// NaN != NaN under float_equal, so NaN agreement is checked separately
if(std::isnan(float(fp8_val)) and std::isnan(fp8e5m2fnuz_to_fp32_value(bit_val)))
{
return true;
}
return migraphx::float_equal(float(fp8_val), fp8e5m2fnuz_to_fp32_value(bit_val));
})});
}
// Spot-checks float -> fp8e5m2fnuz conversion against hand-computed bit
// patterns, including saturation beyond +/-57344, rounding, and
// flush-to-zero below the smallest subnormal.
TEST_CASE(test_fp8_cast_from_float)
{
std::unordered_map<float, uint8_t> test_vals = {
{57344, 0x7f}, {-57344, 0xff}, {60000, 0x7f}, {-60000, 0xff},
{448, 0x63}, {-448, 0xe3}, {256, 0x60}, {-256, 0xe0},
{240, 0x60}, {-240, 0xe0}, {3.05176e-05, 0x4}, {-3.05176e-05, 0x84},
{1.52588e-05, 0x2}, {-1.52588e-05, 0x82}, {7.62939e-06, 0x1}, {-7.62939e-06, 0x81},
{3.81469e-06, 0x0}, {-3.81469e-06, 0x0}, {1e+07, 0x7f}, {1, 0x40},
{-1, 0xc0}, {0.1, 0x32}, {0.11, 0x33}, {0.111, 0x33},
{0.1111, 0x33}, {-0.1, 0xb2}, {-0.11, 0xb3}, {-0.111, 0xb3},
{-0.1111, 0xb3}, {0.2, 0x36}, {2, 0x44}, {20, 0x51},
{200, 0x5e}, {-0.2, 0xb6}, {-2, 0xc4}, {-20, 0xd1},
{-200, 0xde}, {0.5, 0x3c}, {-0.5, 0xbc}, {1.17549e-38, 0x0},
{1.4013e-45, 0x0},
};
EXPECT(bool{std::all_of(test_vals.begin(), test_vals.end(), [](const auto sample) {
return migraphx::float_equal(
migraphx::fp8::fp8e5m2fnuz(sample.first),
migraphx::fp8::fp8e5m2fnuz(sample.second, migraphx::fp8::fp8e5m2fnuz::from_bits()));
})});
}
// +0.0 round-trips exactly through fp8e5m2fnuz.
TEST_CASE(test_positive_zero)
{
float zero = 0.0;
migraphx::fp8::fp8e5m2fnuz fp8_zero(zero);
EXPECT(fp8_zero.is_zero());
EXPECT(migraphx::float_equal(zero, float(fp8_zero)));
}
// fnuz formats have no -0 encoding: -0.0 must collapse to +0.0.
TEST_CASE(test_negative_zero)
{
float nzero = -0.0;
float pzero = 0.0;
migraphx::fp8::fp8e5m2fnuz fp8_nzero(nzero);
EXPECT(fp8_nzero.is_zero());
// negative zero gets converted to positive zero
EXPECT(migraphx::float_equal(pzero, float(fp8_nzero)));
}
// float NaN converts to the fp8 NaN encoding.
TEST_CASE(test_nan_1)
{
float fnan = std::numeric_limits<float>::quiet_NaN();
migraphx::fp8::fp8e5m2fnuz fp8_nan(fnan);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
}
// The fp8 type's own quiet_NaN bit pattern is NaN both as fp8 and as float.
TEST_CASE(test_nan_2)
{
auto fnan = std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::quiet_NaN();
migraphx::fp8::fp8e5m2fnuz fp8_nan(fnan.data, migraphx::fp8::fp8e5m2fnuz::from_bits());
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(fp8_nan));
EXPECT(std::isnan(float(fp8_nan)));
}
TEST_CASE(test_infinity_1)
{
float finf = std::numeric_limits<float>::infinity();
// no inf in fp8e5m2fnuz it gets clipped to Nans
migraphx::fp8::fp8e5m2fnuz fp8_nan(finf);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(float(fp8_nan)));
}
TEST_CASE(test_infinity_2)
{
// neg inf
float finf = -1.0 * std::numeric_limits<float>::infinity();
// no inf in fp8e5m2fnuz it gets clipped to NaNs
migraphx::fp8::fp8e5m2fnuz fp8_nan(finf);
EXPECT(fp8_nan.is_nan());
EXPECT(std::isnan(float(fp8_nan)));
}
// float::max saturates to the fp8e5m2fnuz maximum (57344).
TEST_CASE(test_numeric_max_1)
{
float fmax = std::numeric_limits<float>::max();
migraphx::fp8::fp8e5m2fnuz fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::max());
}
TEST_CASE(test_numeric_max_2)
{
// gets clipped to max
float fmax = 2 * std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::max();
migraphx::fp8::fp8e5m2fnuz fp8_max(fmax);
EXPECT(fp8_max == std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::max());
}
// float::lowest saturates to the fp8e5m2fnuz lowest (-57344).
TEST_CASE(test_numeric_lowest_1)
{
float flowest = std::numeric_limits<float>::lowest();
migraphx::fp8::fp8e5m2fnuz fp8_lowest(flowest);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::lowest());
}
TEST_CASE(test_numeric_lowest_2)
{
// gets clipped to lowest
float fmin = 2.0 * std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::lowest();
migraphx::fp8::fp8e5m2fnuz fp8_lowest(fmin);
EXPECT(fp8_lowest == std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::lowest());
}
// The fp8e5m2fnuz range is symmetric: lowest() == -max().
TEST_CASE(test_max_eq_lowest)
{
EXPECT(migraphx::float_equal(std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::lowest(),
-1 * std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::max()));
}
// Zeros are finite; the NaN encoding is not.
TEST_CASE(test_isfinite)
{
EXPECT(std::isfinite(migraphx::fp8::fp8e5m2fnuz(0.0)));
EXPECT(std::isfinite(migraphx::fp8::fp8e5m2fnuz(-0.0)));
EXPECT(not std::isfinite(
migraphx::fp8::fp8e5m2fnuz(std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::quiet_NaN())));
}
// fp8e5m2fnuz advertises no infinity via numeric_limits.
TEST_CASE(test_no_infinity)
{
EXPECT(not bool{std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::has_infinity});
}
// Arithmetic and comparison operators: x + (-x) == 0, and ordering of
// positive vs negative values behaves as for regular floats.
TEST_CASE(test_binary_ops)
{
auto a = migraphx::fp8::fp8e5m2fnuz(-1.0);
auto b = migraphx::fp8::fp8e5m2fnuz(1.0);
auto c = migraphx::fp8::fp8e5m2fnuz(0.0);
auto d = migraphx::fp8::fp8e5m2fnuz(-0.0);
EXPECT(migraphx::float_equal((c + d), c));
EXPECT(migraphx::float_equal((c + d), d));
EXPECT(migraphx::float_equal((a + b), c));
EXPECT(migraphx::float_equal((a + b), d));
auto e = migraphx::fp8::fp8e5m2fnuz(10.0);
auto f = migraphx::fp8::fp8e5m2fnuz(-10.0);
EXPECT(bool{e > f});
EXPECT(bool{f < e});
EXPECT(bool{f <= e});
EXPECT(bool{e >= f});
EXPECT(bool{e <= e});
EXPECT(bool{f >= f});
EXPECT(not migraphx::float_equal(f, e));
}
// fabs(-1) == 1 for the fp8 type.
TEST_CASE(test_fabs)
{
auto a = migraphx::fp8::fp8e5m2fnuz(-1.0);
auto b = migraphx::fp8::fp8e5m2fnuz(1.0);
EXPECT(migraphx::float_equal(b, migraphx::fp8::fabs(a)));
}
// operator<< prints the numeric value ("-1") and NaN as "nan".
TEST_CASE(test_stream_op)
{
auto a = migraphx::fp8::fp8e5m2fnuz(-1.0);
std::stringstream ss;
ss << a;
EXPECT(std::string("-1") == ss.str());
ss = std::stringstream();
auto b = std::numeric_limits<migraphx::fp8::fp8e5m2fnuz>::quiet_NaN();
ss << b;
EXPECT(std::string("nan") == ss.str());
}
// Test-framework entry point: runs every TEST_CASE registered above.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -414,8 +414,8 @@ TEST_CASE(add_reshape_add_nonstandard)
auto y = mm->add_parameter("y", s1);
auto z = mm->add_parameter("z", s2);
auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
auto c = mm->add_instruction(migraphx::make_op("contiguous"), add1);
auto reshape = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), c);
auto reshape =
mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), add1);
auto add2 = mm->add_instruction(migraphx::make_op("add"), reshape, z);
mm->add_return({add2});
}
......@@ -426,10 +426,8 @@ TEST_CASE(add_reshape_add_nonstandard)
auto x = mm->add_parameter("x", s1);
auto y = mm->add_parameter("y", s1);
auto z = mm->add_parameter("z", s2);
auto cx = mm->add_instruction(migraphx::make_op("contiguous"), x);
auto cy = mm->add_instruction(migraphx::make_op("contiguous"), y);
auto x2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s3.lens()}}), cx);
auto y2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s3.lens()}}), cy);
auto x2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s3.lens()}}), x);
auto y2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s3.lens()}}), y);
auto z2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s3.lens()}}), z);
auto fadd =
add_pointwise(p2, "main:pointwise0", {x2, y2, z2}, [=](auto* pm, const auto& inputs) {
......@@ -466,10 +464,8 @@ TEST_CASE(add_unsqueeze_add_nonstandard)
auto x = mm->add_parameter("x", s1);
auto y = mm->add_parameter("y", s1);
auto z = mm->add_parameter("z", s2);
auto cx = mm->add_instruction(migraphx::make_op("contiguous"), x);
auto cy = mm->add_instruction(migraphx::make_op("contiguous"), y);
auto x2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), cx);
auto y2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), cy);
auto x2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), x);
auto y2 = mm->add_instruction(migraphx::make_op("reshape", {{"dims", s2.lens()}}), y);
auto fadd =
add_pointwise(p2, "main:pointwise0", {x2, y2, z}, [=](auto* pm, const auto& inputs) {
auto add1 = pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
......
......@@ -64,7 +64,7 @@ TEST_CASE(mul_literal_round_test)
auto l1 = mm->add_literal(1 / 0.00787402f);
auto mul = mm->add_instruction(migraphx::make_op("mul"), l0, l1);
auto round = mm->add_instruction(migraphx::make_op("round"), mul);
auto round = mm->add_instruction(migraphx::make_op("nearbyint"), mul);
mm->add_return({round});
......
......@@ -144,14 +144,19 @@ TEST_CASE(int_quant_dot_tanh_fails)
auto tanh = add_pointwise(p1, "main:pointwise0", {dot}, single_pointwise("tanh"));
mm->add_return({tanh});
}
migraphx::program p2(p1);
// This pass should do nothing as int32_t tanh isn't supported.
// This pass should not fuse as int32_t tanh isn't supported.
run_pass(p1);
EXPECT(p1 == p2);
auto* mm = p1.get_main_module();
bool has_pointwise =
std::any_of(mm->begin(), mm->end(), [&](const auto& i) { return i.name() == "pointwise"; });
EXPECT(has_pointwise);
}
// Entry point: the MLIR tests are meaningful only when MLIR support is
// compiled in, so bail out early (successfully) otherwise.
int main(int argc, const char* argv[])
{
    if(not migraphx::gpu::mlir_enabled())
        return 0;
    test::run(argc, argv);
    return 0;
}
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iostream>
#include <vector>
#include <migraphx/gpu/gemm.hpp>
#include <hip/hip_runtime_api.h>
#include <migraphx/gpu/target.hpp>
#include <migraphx/verify.hpp>
#include <test.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
// includes needed for run_lowering
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
// Abbreviated lowering pipeline: only auto_contiguous + gpu::lowering are
// applied, since this test does not need the usual cleanup passes.
void run_lowering(migraphx::program& p, bool offload_copy = false)
{
    migraphx::gpu::context ctx{};
    migraphx::run_passes(*p.get_main_module(),
                         {migraphx::auto_contiguous{},
                          migraphx::gpu::lowering{&ctx, offload_copy}});
}
/**
 * Tests the automatic GEMM tuning feature. In the finalize() method of the gemm op,
 * rocBLAS API functions are called to quickly benchmark all the GEMM solutions
 * available in the currently installed rocBLAS library and choose the index of the fastest.
 */
TEST_CASE(gemm_tune_with_rocblas)
{
migraphx::program p;
auto* mm = p.get_main_module();
// Plain (non-batched) float GEMM: (4x2) * (2x3) -> (4x3).
migraphx::shape sa{migraphx::shape::float_type, {4, 2}};
migraphx::shape sb{migraphx::shape::float_type, {2, 3}};
auto a = mm->add_parameter("a", sa);
auto b = mm->add_parameter("b", sb);
migraphx::operation dot_op = migraphx::make_op("dot");
mm->add_instruction(dot_op, a, b);
// lowering adds gemm implementation for dot operator
run_lowering(p);
migraphx::target gpu_t = migraphx::gpu::target{};
migraphx::compile_options options;
// exhaustive_tune triggers the rocBLAS solution benchmarking in finalize().
options.exhaustive_tune = true;
p.compile(gpu_t, options);
migraphx::value solution_idx(0);
// Locate the lowered gpu::gemm instruction and read back the chosen index.
for(auto ins : iterator_for(*p.get_main_module()))
{
if(ins->name() == "gpu::gemm")
{
auto gemm_op = migraphx::get_operation(ins);
// tuned solution index is not deterministic, but anything other than 0
// (default, invalid, or not available) is good.
// gemm_op.to_value().debug_print();
solution_idx = gemm_op.to_value()["solution_idx"];
break;
}
}
// Without the tuning API the index is expected to remain at its default 0.
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
EXPECT(0 != solution_idx.to<std::size_t>());
#else
EXPECT(0 == solution_idx.to<std::size_t>());
#endif
}
// GEMM tuning of a strided-batch matrix; invokes rocblas_gemm_strided_batched_ex
TEST_CASE(gemm_tune_strided)
{
    migraphx::program p;
    auto* mm = p.get_main_module();
    // 3-D shapes select the strided-batched rocBLAS code path.
    migraphx::shape sa{migraphx::shape::float_type, {4, 2, 2}};
    migraphx::shape sb{migraphx::shape::float_type, {4, 2, 2}};
    migraphx::shape s_output{migraphx::shape::float_type, {4, 2, 2}};
    auto a      = mm->add_parameter("a", sa);
    auto b      = mm->add_parameter("b", sb);
    auto output = mm->add_parameter("out", s_output);
    // Add the already-lowered GEMM op directly; no dot + lowering step needed.
    auto gemm_oper = migraphx::make_op("gpu::gemm", {{"beta", 2}});
    mm->add_instruction(gemm_oper, a, b, output);
    migraphx::target gpu_t = migraphx::gpu::target{};
    migraphx::compile_options options;
    // exhaustive_tune triggers the rocBLAS solution benchmarking in finalize().
    options.exhaustive_tune = true;
    p.compile(gpu_t, options);
    migraphx::value solution_idx(0);
    for(auto ins : iterator_for(*p.get_main_module()))
    {
        if(ins->name() == "gpu::gemm")
        {
            // Fetch the operation value once; the previous code kept an unused
            // local and then called to_value() a second time.
            auto gemmv = migraphx::get_operation(ins).to_value();
            // tuned solution index is not deterministic, but anything other than 0
            // (default, invalid, or not available) is good.
            solution_idx = gemmv["solution_idx"];
            break;
        }
    }
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
    EXPECT(0 != solution_idx.to<std::size_t>());
#else
    EXPECT(0 == solution_idx.to<std::size_t>());
#endif
}
// GEMM tuning of a strided-batch matrix; created by lowering
TEST_CASE(gemm_tune_strided_lowered)
{
    migraphx::program prog;
    auto* mm = prog.get_main_module();
    // At time of writing this test, gemm_impl considers a shape is strided if it has
    // at least three dimensions and the 3rd-to-last is nonzero, invoking
    // rocblas_gemm_strided_batched_ex. Also, DOT operator requires all dimensions except the last
    // two to be equal.
    migraphx::shape shape_a{migraphx::shape::float_type, {4, 2, 5}};
    migraphx::shape shape_b{migraphx::shape::float_type, {4, 5, 3}};
    auto param_a = mm->add_parameter("a", shape_a);
    auto param_b = mm->add_parameter("b", shape_b);
    mm->add_instruction(migraphx::make_op("dot"), param_a, param_b);
    // lowering adds gemm implementation for dot operator
    run_lowering(prog);
    migraphx::compile_options options;
    options.exhaustive_tune = true;
    prog.compile(migraphx::gpu::target{}, options);
    migraphx::value solution_idx(0);
    for(auto ins : iterator_for(*prog.get_main_module()))
    {
        if(ins->name() != "gpu::gemm")
            continue;
        // tuned solution index is not deterministic, but anything other than 0
        // (default, invalid, or not available) is good.
        solution_idx = migraphx::get_operation(ins).to_value()["solution_idx"];
        break;
    }
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
    EXPECT(0 != solution_idx.to<std::size_t>());
#else
    EXPECT(0 == solution_idx.to<std::size_t>());
#endif
}
// Seeds the GEMM op with an out-of-range solution index: with the tuning API
// available the invalid index must be rejected and reset to the default 0;
// without it, the (unvalidated) index is kept as-is.
TEST_CASE(gemm_tune_invalid_sol_index)
{
    migraphx::program p;
    auto* mm = p.get_main_module();
    migraphx::shape sa{migraphx::shape::float_type, {4, 2}};
    migraphx::shape sb{migraphx::shape::float_type, {2, 3}};
    migraphx::shape s_output{migraphx::shape::float_type, {4, 3}};
    auto a      = mm->add_parameter("a", sa);
    auto b      = mm->add_parameter("b", sb);
    auto output = mm->add_parameter("out", s_output);
    // 987654321 is far beyond any real rocBLAS solution index.
    auto gemm_oper = migraphx::make_op("gpu::gemm", {{"solution_idx", 987654321}});
    mm->add_instruction(gemm_oper, a, b, output);
    migraphx::target gpu_t = migraphx::gpu::target{};
    migraphx::compile_options options;
    options.exhaustive_tune = true;
    p.compile(gpu_t, options);
    migraphx::value solution_idx(0);
    for(auto ins : iterator_for(*p.get_main_module()))
    {
        if(ins->name() == "gpu::gemm")
        {
            // Fetch the operation value once; the previous code kept an unused
            // local and then called to_value() a second time.
            auto gemmv = migraphx::get_operation(ins).to_value();
            // given invalid starting index, should return default 0
            solution_idx = gemmv["solution_idx"];
            break;
        }
    }
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
    EXPECT(0 == solution_idx.to<std::size_t>());
#else
    EXPECT(0 != solution_idx.to<std::size_t>());
#endif
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -139,7 +139,8 @@ const std::string math_template = R"__migraphx__(
#include <migraphx/kernels/pointwise.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/types.hpp>
using namespace migraphx;
namespace migraphx {
extern "C" {
__global__ void kernel(${type}* p)
{
......@@ -148,6 +149,7 @@ __global__ void kernel(${type}* p)
}
}
}
int main() {}
......@@ -354,10 +356,14 @@ TEST_CASE(compile_math)
if(t == migraphx::shape::half_type)
name.insert(0, "migraphx::");
data_types.push_back(name);
// fp8 doesn't have vectorization support yet, therefore skip it for now.
if(t != migraphx::shape::fp8e4m3fnuz_type)
{
migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
});
}
}
migraphx::shape input{migraphx::shape::float_type, {5, 2}};
migraphx::gpu::hip_compile_options options;
options.global = 1024;
......@@ -396,7 +402,10 @@ TEST_CASE(assert_type_min_max)
migraphx::gpu::hip_compile_options options;
for(auto&& t : migraphx::shape::types())
{
if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
if(contains({migraphx::shape::bool_type,
migraphx::shape::fp8e4m3fnuz_type,
migraphx::shape::tuple_type},
t))
continue;
auto name = migraphx::shape::cpp_type(t);
if(t == migraphx::shape::half_type)
......@@ -423,7 +432,6 @@ TEST_CASE(assert_type_min_max)
min = std::to_string(as.min());
max = std::to_string(as.max());
}
auto src = migraphx::interpolate_string(assert_template,
{{"type", name}, {"max", max}, {"min", min}});
migraphx::shape input{migraphx::shape::float_type, {5, 2}};
......
......@@ -141,9 +141,9 @@ TEST_CASE(conv)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_convolution(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
return %0 : tensor<1x2x2x2xf32>
func.func @mlir_convolution(%arg0: !migraphx.shaped<2x8x3x3xf32, 72x9x3x1>, %arg1: !migraphx.shaped<1x8x4x4xf32, 128x16x4x1>) -> !migraphx.shaped<1x2x2x2xf32, 8x4x2x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.convolution %arg1, %arg0 {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : <1x8x4x4xf32, 128x16x4x1>, <2x8x3x3xf32, 72x9x3x1> -> <1x2x2x2xf32, 8x4x2x1>
return %0 : !migraphx.shaped<1x2x2x2xf32, 8x4x2x1>
}
}
)__migraphx__";
......@@ -160,15 +160,38 @@ module {
EXPECT(verify_mlir(m));
}
TEST_CASE(conv_nhwc)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_convolution(%arg0: !migraphx.shaped<2x8x3x3xf32, 72x1x24x8>, %arg1: !migraphx.shaped<1x8x4x4xf32, 128x1x32x8>) -> !migraphx.shaped<1x2x2x2xf32, 8x1x4x2> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.convolution %arg1, %arg0 {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : <1x8x4x4xf32, 128x1x32x8>, <2x8x3x3xf32, 72x1x24x8> -> <1x2x2x2xf32, 8x1x4x2>
return %0 : !migraphx.shaped<1x2x2x2xf32, 8x1x4x2>
}
}
)__migraphx__";
migraphx::module m;
auto x = m.add_parameter("x", {migraphx::shape::float_type, {1, 8, 4, 4}, {128, 1, 32, 8}});
auto w = m.add_parameter("w", {migraphx::shape::float_type, {2, 8, 3, 3}, {72, 1, 24, 8}});
auto conv = m.add_instruction(migraphx::make_op("convolution"), x, w);
m.add_return({conv});
auto s = migraphx::gpu::dump_mlir(m);
// Skip test if MLIR is not enabled
if(s.empty())
return;
CHECK(encode(s) == encode(mlir_output));
EXPECT(verify_mlir(m));
}
TEST_CASE(conv_add_relu)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_convolution_add_relu(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
%1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
%2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
return %2 : tensor<1x2x2x2xf32>
func.func @mlir_convolution_add_relu(%arg0: !migraphx.shaped<1x2x2x2xf32, 8x4x2x1>, %arg1: !migraphx.shaped<2x8x3x3xf32, 72x9x3x1>, %arg2: !migraphx.shaped<1x8x4x4xf32, 128x16x4x1>) -> !migraphx.shaped<1x2x2x2xf32, 8x4x2x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.convolution %arg2, %arg1 {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : <1x8x4x4xf32, 128x16x4x1>, <2x8x3x3xf32, 72x9x3x1> -> <1x2x2x2xf32, 8x4x2x1>
%1 = migraphx.add %0, %arg0 : <1x2x2x2xf32, 8x4x2x1>, <1x2x2x2xf32, 8x4x2x1> -> <1x2x2x2xf32, 8x4x2x1>
%2 = migraphx.relu %1 : <1x2x2x2xf32, 8x4x2x1> -> <1x2x2x2xf32, 8x4x2x1>
return %2 : !migraphx.shaped<1x2x2x2xf32, 8x4x2x1>
}
}
)__migraphx__";
......@@ -192,10 +215,10 @@ TEST_CASE(quant_dot_add)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_quant_dot_add(%arg0: tensor<1x5x4xi8>, %arg1: tensor<1x4x3xi8>, %arg2: tensor<1x5x3xi32>) -> tensor<1x5x3xi32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.quant_dot(%arg0, %arg1) : (tensor<1x5x4xi8>, tensor<1x4x3xi8>) -> tensor<1x5x3xi32>
%1 = migraphx.add(%0, %arg2) : (tensor<1x5x3xi32>, tensor<1x5x3xi32>) -> tensor<1x5x3xi32>
return %1 : tensor<1x5x3xi32>
func.func @mlir_quant_dot_add(%arg0: !migraphx.shaped<1x5x4xi8, 20x4x1>, %arg1: !migraphx.shaped<1x4x3xi8, 12x3x1>, %arg2: !migraphx.shaped<1x5x3xi32, 15x3x1>) -> !migraphx.shaped<1x5x3xi32, 15x3x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.quant_dot %arg0, %arg1 : <1x5x4xi8, 20x4x1>, <1x4x3xi8, 12x3x1> -> <1x5x3xi32, 15x3x1>
%1 = migraphx.add %0, %arg2 : <1x5x3xi32, 15x3x1>, <1x5x3xi32, 15x3x1> -> <1x5x3xi32, 15x3x1>
return %1 : !migraphx.shaped<1x5x3xi32, 15x3x1>
}
}
)__migraphx__";
......@@ -219,10 +242,10 @@ TEST_CASE(dot_add)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot_add(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>, %arg2: tensor<1x5x3xf32>) -> tensor<1x5x3xf32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.add(%0, %arg2) : (tensor<1x5x3xf32>, tensor<1x5x3xf32>) -> tensor<1x5x3xf32>
return %1 : tensor<1x5x3xf32>
func.func @mlir_dot_add(%arg0: !migraphx.shaped<1x5x4xf32, 20x4x1>, %arg1: !migraphx.shaped<1x4x3xf32, 12x3x1>, %arg2: !migraphx.shaped<1x5x3xf32, 15x3x1>) -> !migraphx.shaped<1x5x3xf32, 15x3x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot %arg0, %arg1 : <1x5x4xf32, 20x4x1>, <1x4x3xf32, 12x3x1> -> <1x5x3xf32, 15x3x1>
%1 = migraphx.add %0, %arg2 : <1x5x3xf32, 15x3x1>, <1x5x3xf32, 15x3x1> -> <1x5x3xf32, 15x3x1>
return %1 : !migraphx.shaped<1x5x3xf32, 15x3x1>
}
}
)__migraphx__";
......@@ -245,11 +268,11 @@ TEST_CASE(conv_int8_dequantize_quantize)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_quant_convolution_dequantizelinear_quantizelinear(%arg0: tensor<2x8x3x3xi8>, %arg1: tensor<1x8x4x4xi8>, %arg2: tensor<1x2x2x2xf32>, %arg3: tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xi32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.quant_convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xi8>, tensor<2x8x3x3xi8>) -> tensor<1x2x2x2xi32>
%1 = migraphx.dequantizelinear(%0, %arg2, %arg3) : (tensor<1x2x2x2xi32>, tensor<1x2x2x2xf32>, tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xf32>
%2 = migraphx.quantizelinear(%1, %arg2, %arg3) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>, tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xi32>
return %2 : tensor<1x2x2x2xi32>
func.func @mlir_quant_convolution_dequantizelinear_quantizelinear(%arg0: !migraphx.shaped<2x8x3x3xi8, 72x9x3x1>, %arg1: !migraphx.shaped<1x8x4x4xi8, 128x16x4x1>, %arg2: !migraphx.shaped<1x2x2x2xf32, 8x4x2x1>, %arg3: !migraphx.shaped<1x2x2x2xi32, 8x4x2x1>) -> !migraphx.shaped<1x2x2x2xi32, 8x4x2x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.quant_convolution %arg1, %arg0 {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : <1x8x4x4xi8, 128x16x4x1>, <2x8x3x3xi8, 72x9x3x1> -> <1x2x2x2xi32, 8x4x2x1>
%1 = migraphx.dequantizelinear %0, %arg2, %arg3 : <1x2x2x2xi32, 8x4x2x1>, <1x2x2x2xf32, 8x4x2x1>, !migraphx.shaped<1x2x2x2xi32, 8x4x2x1> -> <1x2x2x2xf32, 8x4x2x1>
%2 = migraphx.quantizelinear %1, %arg2, %arg3 : <1x2x2x2xf32, 8x4x2x1>, <1x2x2x2xf32, 8x4x2x1>, !migraphx.shaped<1x2x2x2xi32, 8x4x2x1> -> <1x2x2x2xi32, 8x4x2x1>
return %2 : !migraphx.shaped<1x2x2x2xi32, 8x4x2x1>
}
}
)__migraphx__";
......@@ -278,10 +301,10 @@ TEST_CASE(dot_convert)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot_convert(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>) -> tensor<1x5x3xf16> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.convert(%0) {target_type = 1 : i64} : (tensor<1x5x3xf32>) -> tensor<1x5x3xf16>
return %1 : tensor<1x5x3xf16>
func.func @mlir_dot_convert(%arg0: !migraphx.shaped<1x5x4xf32, 20x4x1>, %arg1: !migraphx.shaped<1x4x3xf32, 12x3x1>) -> !migraphx.shaped<1x5x3xf16, 15x3x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot %arg0, %arg1 : <1x5x4xf32, 20x4x1>, <1x4x3xf32, 12x3x1> -> <1x5x3xf32, 15x3x1>
%1 = migraphx.convert %0 {target_type = 1 : i64} : <1x5x3xf32, 15x3x1> to <1x5x3xf16, 15x3x1>
return %1 : !migraphx.shaped<1x5x3xf16, 15x3x1>
}
}
)__migraphx__";
......@@ -304,10 +327,10 @@ TEST_CASE(dot_where)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot_where(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>, %arg2: tensor<1x5x3xi8>, %arg3: tensor<1x5x3xf32>) -> tensor<1x5x3xf32> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.where(%arg2, %0, %arg3) : (tensor<1x5x3xi8>, tensor<1x5x3xf32>, tensor<1x5x3xf32>) -> tensor<1x5x3xf32>
return %1 : tensor<1x5x3xf32>
func.func @mlir_dot_where(%arg0: !migraphx.shaped<1x5x4xf32, 20x4x1>, %arg1: !migraphx.shaped<1x4x3xf32, 12x3x1>, %arg2: !migraphx.shaped<1x5x3xi8, 15x3x1>, %arg3: !migraphx.shaped<1x5x3xf32, 15x3x1>) -> !migraphx.shaped<1x5x3xf32, 15x3x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} {
%0 = migraphx.dot %arg0, %arg1 : <1x5x4xf32, 20x4x1>, <1x4x3xf32, 12x3x1> -> <1x5x3xf32, 15x3x1>
%1 = migraphx.where %arg2, %0, %arg3 : <1x5x3xi8, 15x3x1>, <1x5x3xf32, 15x3x1>, <1x5x3xf32, 15x3x1> -> <1x5x3xf32, 15x3x1>
return %1 : !migraphx.shaped<1x5x3xf32, 15x3x1>
}
}
)__migraphx__";
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/instruction_ref.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/target.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/apply_alpha_beta.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>
#include <test.hpp>
#include "make_precompile_op.hpp"
// Treat some operators as compilable to enable lowering
MIGRAPHX_GPU_TEST_PRECOMPILE("add", "mul", "convert")
// Lowers a module for the GPU target: make layouts contiguous, lower to GPU
// ops, materialize allocations, pack int8 GEMM arguments, and sweep dead code
// after each rewriting stage.
void run_passes(migraphx::module& m, migraphx::gpu::context& ctx)
{
    migraphx::run_passes(m,
                         {
                             migraphx::auto_contiguous{},
                             migraphx::gpu::lowering{&ctx, false},
                             migraphx::dead_code_elimination{},
                             migraphx::replace_allocate{migraphx::gpu::gpu_allocation_model{}},
                             migraphx::dead_code_elimination{},
                             migraphx::gpu::pack_int8_args{},
                             migraphx::dead_code_elimination{},
                         });
}
// Verifies lowering of a 2-D int8 quant_dot with alpha=1, beta=1. The lowered
// module is compared (==) against a hand-built expectation, so instruction
// insertion order below is significant.
TEST_CASE(quant_dot)
{
// Reference module before lowering: r = 1 * quant_dot(a, b) + 1 * c.
auto create_module = [] {
migraphx::module m("test");
migraphx::shape m1_shape{migraphx::shape::int8_type, {5, 8}};
migraphx::shape m2_shape{migraphx::shape::int8_type, {8, 7}};
migraphx::shape m3_shape{migraphx::shape::int32_type, {5, 7}};
auto l1 = m.add_parameter("a", m1_shape);
auto l2 = m.add_parameter("b", m2_shape);
auto l3 = m.add_parameter("c", m3_shape);
auto r =
migraphx::add_apply_alpha_beta(m, {l1, l2, l3}, migraphx::make_op("quant_dot"), 1, 1);
m.add_return({r});
return m;
};
// Expected module after lowering; int8_x4 selects whether the B matrix is
// packed via gpu::int8_gemm_pack_a before the quant_gemm call.
auto create_optimized_int8_x4 = [](bool int8_x4) {
migraphx::module m("test");
migraphx::shape m1_shape{migraphx::shape::int8_type, {5, 8}};
migraphx::shape m2_shape{migraphx::shape::int8_type, {8, 7}};
migraphx::shape m3_shape{migraphx::shape::int32_type, {5, 7}};
auto l1 = m.add_parameter("a", m1_shape);
auto l2 = m.add_parameter("b", m2_shape);
auto l3 = m.add_parameter("c", m3_shape);
auto beta = m.add_literal(1);
auto output = m.add_parameter("test:#output_0", m3_shape);
auto gemm_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
auto packa = l2;
if(int8_x4)
{
auto alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m2_shape)}}));
packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), l2, alloc);
}
auto gemm = m.add_instruction(
migraphx::make_op("gpu::quant_gemm",
{{"int8_x4_format", int8_x4},
{"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
l1,
packa,
gemm_alloc);
// beta * c is realized as a broadcast literal multiplied elementwise with c,
// then added to the gemm result, both via precompiled pointwise ops.
auto beta_broadcast = m.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", m3_shape.lens()}}), beta);
auto beta_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
auto beta_contiguous =
m.add_instruction(migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
auto mul_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
auto m3_beta = m.add_instruction(make_precompile_op("mul"), l3, beta_contiguous, mul_alloc);
auto gemm_add = m.add_instruction(make_precompile_op("add"), gemm, m3_beta, output);
m.add_return({gemm_add});
return m;
};
auto m1 = create_module();
auto ctx = migraphx::gpu::context{};
run_passes(m1, ctx);
// The expected module depends on whether this device reports int8x4 format.
bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
// Verifies lowering of a 4-D batched quant_dot whose inputs are transposed
// (alpha=3, no beta input). The lowered module is compared (==) against a
// hand-built expectation, so instruction insertion order below is significant.
TEST_CASE(quant_dot_trans)
{
// Reference module before lowering: r = 3 * quant_dot(a^T, b^T).
auto create_module = [] {
migraphx::module m("test");
migraphx::shape s1{migraphx::shape::int8_type, {3, 2, 8, 5}};
migraphx::shape s2{migraphx::shape::int8_type, {3, 2, 7, 8}};
auto l1 = m.add_parameter("a", s1);
auto tl1 =
m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l1);
auto l2 = m.add_parameter("b", s2);
auto tl2 =
m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
auto r = migraphx::add_apply_alpha_beta(m, {tl1, tl2}, migraphx::make_op("quant_dot"), 3);
m.add_return({r});
return m;
};
// Expected module after lowering: the transposed inputs are made contiguous,
// alpha is folded into the A operand (with int8 -> int32 -> int8 round trip,
// since the literal is int32), and B is optionally packed for int8x4.
auto create_optimized_int8_x4 = [](bool int8_x4) {
migraphx::module m("test");
migraphx::shape s1{migraphx::shape::int8_type, {3, 2, 8, 5}};
migraphx::shape s2{migraphx::shape::int8_type, {3, 2, 7, 8}};
migraphx::shape s3{migraphx::shape::int32_type, {3, 2, 5, 7}};
auto l1 = m.add_parameter("a", s1);
auto l2 = m.add_parameter("b", s2);
auto alpha = m.add_literal(3);
auto output = m.add_parameter("test:#output_0", s3);
auto tl1 =
m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l1);
migraphx::shape ts1{migraphx::shape::int8_type, {3, 2, 5, 8}};
auto alloca = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts1)}}));
auto conta = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl1, alloca);
auto tl2 =
m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
migraphx::shape ts2{migraphx::shape::int8_type, {3, 2, 8, 7}};
auto allocb = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
auto contb = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl2, allocb);
// Broadcast the int32 alpha literal to A's (transposed) lens.
auto alpha_broadcast = m.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", conta->get_shape().lens()}}), alpha);
auto alpha_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate",
{{"shape",
migraphx::to_value(migraphx::shape(migraphx::shape::int32_type, {3, 2, 5, 8}))}}));
auto alpha_contiguous =
m.add_instruction(migraphx::make_op("gpu::contiguous"), alpha_broadcast, alpha_alloc);
// alpha = int32 and tl1 = int8, convert tl1 to int32 for multiplication and then convert
// back result to int8
auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
auto tl1_convert =
m.add_instruction(make_precompile_op(migraphx::make_op(
"convert", {{"target_type", alpha->get_shape().type()}})),
conta,
tl1_convert_alloc);
auto mul_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
auto tl1_alpha_int32 =
m.add_instruction(make_precompile_op("mul"), alpha_contiguous, tl1_convert, mul_alloc);
// convert mul_res to int8
auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
auto tl1_alpha_int8 =
m.add_instruction(make_precompile_op(migraphx::make_op(
"convert", {{"target_type", conta->get_shape().type()}})),
tl1_alpha_int32,
tl1_alpha_int8_alloc);
auto packb = contb;
if(int8_x4)
{
auto allocpb = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), contb, allocpb);
}
auto gemm = m.add_instruction(
migraphx::make_op("gpu::quant_gemm",
{{"int8_x4_format", int8_x4},
{"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
tl1_alpha_int8,
packb,
output);
m.add_return({gemm});
return m;
};
auto m1 = create_module();
auto ctx = migraphx::gpu::context{};
run_passes(m1, ctx);
// The expected module depends on whether this device reports int8x4 format.
bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
// Verifies lowering of an int8 quant_dot whose inner dimension (6) is not a
// multiple of 4: in int8x4 format both operands must be zero-padded to K=8
// before packing. The lowered module is compared (==) against a hand-built
// expectation, so instruction insertion order below is significant.
TEST_CASE(quant_dot_pad)
{
// Reference module before lowering: r = 1 * quant_dot(a, b) + 1 * c.
auto create_module = [] {
migraphx::module m("test");
migraphx::shape s1{migraphx::shape::int8_type, {5, 6}};
migraphx::shape s2{migraphx::shape::int8_type, {6, 7}};
migraphx::shape s3{migraphx::shape::int32_type, {5, 7}};
auto l1 = m.add_parameter("a", s1);
auto l2 = m.add_parameter("b", s2);
auto l3 = m.add_parameter("c", s3);
auto r =
migraphx::add_apply_alpha_beta(m, {l1, l2, l3}, migraphx::make_op("quant_dot"), 1, 1);
m.add_return({r});
return m;
};
// Expected module after lowering; ps1/ps2 are the K-padded operand shapes
// used only on the int8x4 path.
auto create_optimized_int8_x4 = [](bool int8_x4) {
migraphx::module m("test");
migraphx::shape s1{migraphx::shape::int8_type, {5, 6}};
migraphx::shape ps1{migraphx::shape::int8_type, {5, 8}};
migraphx::shape s2{migraphx::shape::int8_type, {6, 7}};
migraphx::shape ps2{migraphx::shape::int8_type, {8, 7}};
migraphx::shape s3{migraphx::shape::int32_type, {5, 7}};
auto l1 = m.add_parameter("a", s1);
auto l2 = m.add_parameter("b", s2);
auto l3 = m.add_parameter("c", s3);
auto beta = m.add_literal(1);
auto output = m.add_parameter("test:#output_0", s3);
auto pl1 = l1;
auto packa = l2;
migraphx::instruction_ref pl2{};
if(int8_x4)
{
// Pad A's columns (K) and B's rows (K) with zeros; insert_instruction
// positions match where the lowering pass places the allocations/pads.
auto po1 = m.insert_instruction(
l1, migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps1)}}));
pl1 = m.add_instruction(
migraphx::make_op("gpu::pad", {{"mode", 0}, {"pads", {0, 2, 0, 0}}, {"value", 0}}),
l1,
po1);
auto po2 = m.insert_instruction(
l2, migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
pl2 = m.insert_instruction(
std::next(l2),
migraphx::make_op("gpu::pad", {{"mode", 0}, {"pads", {2, 0, 0, 0}}, {"value", 0}}),
l2,
po2);
}
auto gemm_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
if(int8_x4)
{
auto alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pl2, alloc);
}
auto gemm = m.add_instruction(
migraphx::make_op("gpu::quant_gemm",
{{"int8_x4_format", int8_x4},
{"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
pl1,
packa,
gemm_alloc);
// beta * c via broadcast literal + precompiled mul, then fused add to output.
auto beta_broadcast =
m.add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s3.lens()}}), beta);
auto beta_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
auto beta_contiguous =
m.add_instruction(migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
auto mul_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
auto m3_beta = m.add_instruction(make_precompile_op("mul"), l3, beta_contiguous, mul_alloc);
auto gemm_add = m.add_instruction(make_precompile_op("add"), gemm, m3_beta, output);
m.add_return({gemm_add});
return m;
};
auto m1 = create_module();
auto ctx = migraphx::gpu::context{};
run_passes(m1, ctx);
// The expected module depends on whether this device reports int8x4 format.
bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
// Verifies GPU lowering of a quant_dot (int8 GEMM) whose operands are both
// transposed and whose inner (k) dimension, 9, is not a multiple of 4: the
// lowered module must insert contiguous copies, scale by alpha, and — when the
// int8x4 format is in use — pad the k dimension up to 12 and pack operand B.
TEST_CASE(quant_dot_trans_pad)
{
    // Reference module: transpose both inputs, then quant_dot with alpha = 3.
    auto create_module = [] {
        migraphx::module m("test");
        migraphx::shape s1{migraphx::shape::int8_type, {3, 2, 9, 5}};
        migraphx::shape s2{migraphx::shape::int8_type, {3, 2, 7, 9}};
        auto l1 = m.add_parameter("a", s1);
        auto tl1 =
            m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l1);
        auto l2 = m.add_parameter("b", s2);
        auto tl2 =
            m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
        auto r = migraphx::add_apply_alpha_beta(m, {tl1, tl2}, migraphx::make_op("quant_dot"), 3);
        m.add_return({r});
        return m;
    };
    // Expected lowered module; int8_x4 toggles the extra pad/pack instructions.
    auto create_optimized_int8_x4 = [](bool int8_x4) {
        migraphx::module m("test");
        migraphx::shape s1{migraphx::shape::int8_type, {3, 2, 9, 5}};
        // ps1/ps2 are the padded operand shapes: k grows from 9 to 12.
        migraphx::shape ps1{migraphx::shape::int8_type, {3, 2, 5, 12}};
        migraphx::shape s2{migraphx::shape::int8_type, {3, 2, 7, 9}};
        migraphx::shape ps2{migraphx::shape::int8_type, {3, 2, 12, 7}};
        migraphx::shape s3{migraphx::shape::int32_type, {3, 2, 5, 7}};
        auto l1 = m.add_parameter("a", s1);
        auto l2 = m.add_parameter("b", s2);
        auto alpha = m.add_literal(3);
        auto output = m.add_parameter("test:#output_0", s3);
        // Operand A: transpose then materialize contiguously into ts1.
        auto tl1 =
            m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l1);
        migraphx::shape ts1{migraphx::shape::int8_type, {3, 2, 5, 9}};
        auto ta = m.add_instruction(
            migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts1)}}));
        auto conta = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl1, ta);
        // Operand B: transpose, contiguous copy, then (int8x4 only) zero-pad k.
        auto tl2 =
            m.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), l2);
        migraphx::shape ts2{migraphx::shape::int8_type, {3, 2, 9, 7}};
        auto tb = m.add_instruction(
            migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ts2)}}));
        migraphx::instruction_ref ptb{};
        if(int8_x4)
        {
            ptb = m.add_instruction(
                migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
        }
        auto contb = m.add_instruction(migraphx::make_op("gpu::contiguous"), tl2, tb);
        auto pb = contb;
        if(int8_x4)
        {
            // Pads dimension 2 (9 -> 12) of B.
            pb = m.add_instruction(
                migraphx::make_op("gpu::pad", {{"mode", 0}, {"pads", {0, 0, 3, 0, 0, 0, 0, 0}}}),
                contb,
                ptb);
        }
        // Apply alpha to A: broadcast the int32 literal over A's shape.
        auto alpha_broadcast = m.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", conta->get_shape().lens()}}), alpha);
        auto alpha_alloc = m.add_instruction(
            migraphx::make_op("hip::allocate",
                              {{"shape",
                                migraphx::to_value(migraphx::shape(migraphx::shape::int32_type,
                                                                   conta->get_shape().lens()))}}));
        auto alpha_contiguous =
            m.add_instruction(migraphx::make_op("gpu::contiguous"), alpha_broadcast, alpha_alloc);
        // alpha = int32 and tl1 = int8, convert tl1 to int32 for multiplication and then convert
        // back result to int8
        auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
            "hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
        auto tl1_convert =
            m.add_instruction(make_precompile_op(migraphx::make_op(
                                  "convert", {{"target_type", alpha->get_shape().type()}})),
                              conta,
                              tl1_convert_alloc);
        auto mul_alloc = m.add_instruction(migraphx::make_op(
            "hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
        auto tl1_alpha_int32 =
            m.add_instruction(make_precompile_op("mul"), alpha_contiguous, tl1_convert, mul_alloc);
        // convert mul_res to int8
        auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
            "hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
        migraphx::instruction_ref pta{};
        if(int8_x4)
        {
            pta = m.add_instruction(
                migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps1)}}));
        }
        auto tl1_alpha_int8 =
            m.add_instruction(make_precompile_op(migraphx::make_op(
                                  "convert", {{"target_type", conta->get_shape().type()}})),
                              tl1_alpha_int32,
                              tl1_alpha_int8_alloc);
        auto pa = tl1_alpha_int8;
        if(int8_x4)
        {
            // Pads dimension 3 (9 -> 12) of the scaled A operand.
            pa = m.add_instruction(
                migraphx::make_op("gpu::pad", {{"mode", 0}, {"pads", {0, 0, 0, 3, 0, 0, 0, 0}}}),
                tl1_alpha_int8,
                pta);
        }
        auto packb = pb;
        if(int8_x4)
        {
            // int8x4 format requires the B operand packed via int8_gemm_pack_a.
            auto allocpb = m.add_instruction(
                migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps2)}}));
            packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pb, allocpb);
        }
        auto gemm = m.add_instruction(
            migraphx::make_op("gpu::quant_gemm",
                              {{"int8_x4_format", int8_x4},
                               {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
            pa,
            packb,
            output);
        m.add_return({gemm});
        return m;
    };
    // Lower the reference module and compare against the expected module built
    // for whichever int8 format the current context reports.
    auto m1  = create_module();
    auto ctx = migraphx::gpu::context{};
    run_passes(m1, ctx);
    bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
    auto m2      = create_optimized_int8_x4(int8_x4);
    EXPECT(m1 == m2);
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -24,6 +24,7 @@
#include <atomic>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
......
......@@ -47,7 +47,11 @@ compile_function(const std::string& src, const std::string& flags, const std::st
{
migraphx::src_compiler compiler;
compiler.flags = flags + "-std=c++14 -fPIC -shared";
#ifdef _WIN32
compiler.output = "simple.dll";
#else
compiler.output = "libsimple.so";
#endif
migraphx::src_file f{"main.cpp", src};
auto image = compiler.compile({f});
return migraphx::dynamic_loader{image}.get_function<F>(fname);
......
cc7e8cc21f83df3a41d9736dba9211bb832764ad
44b58437402b207c8216f3be8c75accb7409be1c
averagepool_dilate_test:
Y
xy" AveragePool*
dilations@*
kernel_shape@*
pads@@*
strides@averagepool_dilate_testZ
x



b
y



B
\ No newline at end of file
 dynamicquantizelinear_1d_test:
4
xyy_scale y_zero_point"DynamicQuantizeLineardynamicquantizelinear_1d_testZ
x

b
y

b
y_scale

b
y_zero_point

B
\ No newline at end of file
 dynamicquantizelinear_2d_test:
4
xyy_scale y_zero_point"DynamicQuantizeLineardynamicquantizelinear_2d_testZ
x


b
y


b
y_scale

b
y_zero_point

B
\ No newline at end of file
......@@ -276,6 +276,22 @@ def averagepool_1d_test():
return ([node], [x], [out])
@onnx_test()
def averagepool_dilate_test():
    """1-D AveragePool with kernel 2, stride 1, pads [1, 1] and dilation 3."""
    pool_attrs = dict(kernel_shape=[2], strides=[1], pads=[1, 1], dilations=[3])
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 4, 3])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2])
    node = onnx.helper.make_node('AveragePool',
                                 inputs=['x'],
                                 outputs=['y'],
                                 **pool_attrs)
    return ([node], [x], [y])
@onnx_test()
def averagepool_3d_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 5, 5, 5])
......@@ -1952,6 +1968,40 @@ def dropout_test():
return ([node], [x], [y])
@onnx_test()
def dynamicquantizelinear_1d_test():
    """DynamicQuantizeLinear over a rank-1 float input of 6 elements."""
    vi = helper.make_tensor_value_info
    x = vi('x', TensorProto.FLOAT, [6])
    y = vi('y', TensorProto.UINT8, [6])
    y_scale = vi('y_scale', TensorProto.FLOAT, [1])
    y_zero_point = vi('y_zero_point', TensorProto.UINT8, [1])
    node = onnx.helper.make_node('DynamicQuantizeLinear',
                                 inputs=['x'],
                                 outputs=['y', 'y_scale', 'y_zero_point'])
    return ([node], [x], [y, y_scale, y_zero_point])
@onnx_test()
def dynamicquantizelinear_2d_test():
    """DynamicQuantizeLinear over a rank-2 (3x4) float input."""
    vi = helper.make_tensor_value_info
    x = vi('x', TensorProto.FLOAT, [3, 4])
    y = vi('y', TensorProto.UINT8, [3, 4])
    y_scale = vi('y_scale', TensorProto.FLOAT, [1])
    y_zero_point = vi('y_zero_point', TensorProto.UINT8, [1])
    node = onnx.helper.make_node('DynamicQuantizeLinear',
                                 inputs=['x'],
                                 outputs=['y', 'y_scale', 'y_zero_point'])
    return ([node], [x], [y, y_scale, y_zero_point])
@onnx_test()
def elu_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3])
......@@ -3858,6 +3908,64 @@ def instance_norm_val_3d_test():
return ([node], [], [y], [x_tensor, scale_tensor, bias_tensor])
@onnx_test()
def isinf_half_test():
    """IsInf on an fp16 input with default detect_negative/detect_positive."""
    t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT16, [2, 3])
    t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
    node = onnx.helper.make_node('IsInf', inputs=['t1'], outputs=['t2'])
    return ([node], [t1], [t2])
@onnx_test()
def isinf_neg_test():
    # IsInf configured to flag only negative infinities.
    # NOTE(review): the ONNX spec declares detect_negative/detect_positive as
    # scalar INT attributes; passing one-element lists encodes them as INTS.
    # Confirm the parser under test accepts this encoding before changing it —
    # the checked-in .onnx fixtures were generated from this exact form.
    t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
    t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
    node = onnx.helper.make_node(
        'IsInf',
        detect_negative=[1],
        detect_positive=[0],
        inputs=['t1'],
        outputs=['t2'],
    )
    return ([node], [t1], [t2])
@onnx_test()
def isinf_double_pos_test():
    # IsInf on double input, flagging only positive infinities.
    # NOTE(review): detect_negative/detect_positive are INT attributes per the
    # ONNX spec; the one-element lists here encode as INTS — verify the parser
    # accepts this before normalizing (generated fixtures depend on it).
    t1 = helper.make_tensor_value_info('t1', TensorProto.DOUBLE, [2, 3])
    t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
    node = onnx.helper.make_node(
        'IsInf',
        detect_negative=[0],
        detect_positive=[1],
        inputs=['t1'],
        outputs=['t2'],
    )
    return ([node], [t1], [t2])
@onnx_test()
def isinf_no_detect_test():
    # IsInf with both detection flags disabled, so no element is reported.
    # NOTE(review): list-valued detect_negative/detect_positive encode as INTS
    # rather than the spec's INT — confirm the parser tolerates this.
    t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
    t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
    node = onnx.helper.make_node(
        'IsInf',
        detect_negative=[0],
        detect_positive=[0],
        inputs=['t1'],
        outputs=['t2'],
    )
    return ([node], [t1], [t2])
@onnx_test()
def isnan_float_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
......@@ -4276,6 +4384,50 @@ def loop_test():
return ([node], [iter, cond, a, b], [b_loop, uout])
@onnx_test()
def loop_test_implicit_tripcnt():
    # Loop whose max trip count (15) is supplied as an initializer rather than
    # a graph input. The body graph takes (iteration_num, keep_going_inp, b_in)
    # and emits (keep_going, a_sub_b_in, my_local, user_defined_vals); 'a' is
    # captured implicitly from the outer scope.
    body = helper.make_graph([
        helper.make_node("Add", ["a", "b_in"], ["my_local"]),
        helper.make_node("Sub", ["a", "b_in"], ["a_sub_b_in"]),
        helper.make_node("Greater", ["my_local", "a_sub_b_in"],
                         ["keep_going"]),
        helper.make_node("Add", ["a_sub_b_in", "a_sub_b_in"],
                         ["user_defined_vals"]),
    ], "body", [
        helper.make_tensor_value_info('iteration_num', TensorProto.INT64, [1]),
        helper.make_tensor_value_info('keep_going_inp', TensorProto.BOOL, [1]),
        helper.make_tensor_value_info('b_in', TensorProto.FLOAT, [1])
    ], [
        helper.make_tensor_value_info('keep_going', TensorProto.BOOL, [1]),
        helper.make_tensor_value_info('a_sub_b_in', TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info('my_local', TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info('user_defined_vals', TensorProto.FLOAT,
                                      [1]),
    ])
    # Trip count lives in the initializer list (4th element of the returned
    # tuple), which is what makes it "implicit" for this test.
    iter = helper.make_tensor(name='max_trip_count',
                              data_type=TensorProto.INT64,
                              dims=[1],
                              vals=[15])
    node = helper.make_node(
        "Loop",
        inputs=["max_trip_count", "keep_going_cond", "b"],
        outputs=["b_loop", "my_local_loop", "user_defined_vals_loop"],
        body=body)
    a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [1])
    b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [1])
    cond = helper.make_tensor_value_info('keep_going_cond', TensorProto.BOOL,
                                         [1])
    b_loop = helper.make_tensor_value_info('b_loop', TensorProto.FLOAT, [1])
    uout = helper.make_tensor_value_info('user_defined_vals_loop',
                                         TensorProto.FLOAT, [2, 1])
    return ([node], [cond, a, b], [b_loop, uout], [iter])
@onnx_test()
def lpnormalization_axis_error_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3])
......@@ -4382,6 +4534,177 @@ def lrn_test():
return ([node], [x], [y])
@onnx_test()
def lstm_bi_layout_cell_test():
    """Bidirectional LSTM with layout=1 exposing only the cell-state output."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [2, 80, 10])
    r = vi('r', TensorProto.FLOAT, [2, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [2, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 2, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 2, 20])
    pph = vi('pph', TensorProto.FLOAT, [2, 60])
    cellout = vi('cellout', TensorProto.FLOAT, [3, 2, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['', '', 'cellout'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='bidirectional',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [cellout])
@onnx_test()
def lstm_bi_layout_last_test():
    """Bidirectional LSTM with layout=1 returning full sequence and last h."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [2, 80, 10])
    r = vi('r', TensorProto.FLOAT, [2, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [2, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 2, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 2, 20])
    pph = vi('pph', TensorProto.FLOAT, [2, 60])
    hs = vi('hs', TensorProto.FLOAT, [3, 5, 2, 20])
    output = vi('output', TensorProto.FLOAT, [3, 2, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['hs', 'output'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='bidirectional',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [hs, output])
@onnx_test()
def lstm_f_layout_hs_test():
    """Forward LSTM with layout=1 returning hidden sequence and last h."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [1, 80, 10])
    r = vi('r', TensorProto.FLOAT, [1, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [1, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 1, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 1, 20])
    pph = vi('pph', TensorProto.FLOAT, [1, 60])
    hs = vi('hs', TensorProto.FLOAT, [3, 5, 1, 20])
    output = vi('output', TensorProto.FLOAT, [3, 1, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['hs', 'output'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='forward',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [hs, output])
@onnx_test()
def lstm_f_layout_cell_test():
    """Forward LSTM with layout=1 exposing only the cell-state output."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [1, 80, 10])
    r = vi('r', TensorProto.FLOAT, [1, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [1, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 1, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 1, 20])
    pph = vi('pph', TensorProto.FLOAT, [1, 60])
    cellout = vi('cellout', TensorProto.FLOAT, [3, 1, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['', '', 'cellout'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='forward',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [cellout])
@onnx_test()
def lstm_r_layout_test():
    """Reverse LSTM with layout=1 returning only the hidden sequence."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [1, 80, 10])
    r = vi('r', TensorProto.FLOAT, [1, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [1, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 1, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 1, 20])
    pph = vi('pph', TensorProto.FLOAT, [1, 60])
    hs = vi('hs', TensorProto.FLOAT, [3, 5, 1, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['hs'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='reverse',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [hs])
@onnx_test()
def lstm_r_layout_hs_cell_test():
    """Reverse LSTM with layout=1 returning last hidden and cell states."""
    vi = helper.make_tensor_value_info
    seq = vi('seq', TensorProto.FLOAT, [3, 5, 10])
    w = vi('w', TensorProto.FLOAT, [1, 80, 10])
    r = vi('r', TensorProto.FLOAT, [1, 80, 20])
    bias = vi('bias', TensorProto.FLOAT, [1, 160])
    seq_len = vi('seq_len', TensorProto.INT32, [3])
    h0 = vi('h0', TensorProto.FLOAT, [3, 1, 20])
    c0 = vi('c0', TensorProto.FLOAT, [3, 1, 20])
    pph = vi('pph', TensorProto.FLOAT, [1, 60])
    output = vi('output', TensorProto.FLOAT, [3, 1, 20])
    cellout = vi('cellout', TensorProto.FLOAT, [3, 1, 20])
    graph_inputs = [seq, w, r, bias, seq_len, h0, c0, pph]
    node = onnx.helper.make_node(
        'LSTM',
        inputs=['seq', 'w', 'r', 'bias', 'seq_len', 'h0', 'c0', 'pph'],
        outputs=['', 'output', 'cellout'],
        activations=['sigmoid', 'tanh', 'tanh'],
        direction='reverse',
        hidden_size=20,
        clip=0,
        input_forget=1,
        layout=1)
    return ([node], graph_inputs, [output, cellout])
@onnx_test()
def matmul_bmbm_test():
m1 = helper.make_tensor_value_info('1', TensorProto.FLOAT, [3, 6, 7])
......@@ -4609,6 +4932,22 @@ def maxpool_notset_test():
return ([node], [x], [y])
@onnx_test()
def maxpool_dilate_test():
    """1-D MaxPool with kernel 2, stride 1, pads [1, 1] and dilation 3."""
    pool_attrs = dict(kernel_shape=[2], strides=[1], pads=[1, 1], dilations=[3])
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 4, 3])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2])
    node = onnx.helper.make_node('MaxPool',
                                 inputs=['x'],
                                 outputs=['y'],
                                 **pool_attrs)
    return ([node], [x], [y])
@onnx_test()
def maxpool_same_upper_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 5, 5])
......@@ -4883,9 +5222,9 @@ def mod_test_fmod_different_dtypes():
@onnx_test()
def multinomial_test():
sample_size = 10
seed = 0.0
input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
sample_size = 13
seed = 0.
input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 10])
output = helper.make_tensor_value_info("output", TensorProto.INT32,
[1, 10])
......@@ -4898,6 +5237,44 @@ def multinomial_test():
return ([node], [input], [output])
@onnx_test()
def multinomial_dyn_test():
    """Multinomial with a dynamic batch dimension, fixed seed, float output."""
    categories = 5
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
                                          [None, categories])
    output = helper.make_tensor_value_info("output", TensorProto.FLOAT,
                                           [None, categories])
    node = onnx.helper.make_node(
        'Multinomial',
        inputs=['input'],
        outputs=['output'],
        sample_size=100000,
        dtype=1,  # shape::float_type
        seed=1.3)
    return ([node], [input], [output])
@onnx_test()
def multinomial_autoseed_dyn_test():
    # If seed attribute is not given, device should auto generate one at runtime
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
                                          [None, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32,
                                           [None, 10])
    node = onnx.helper.make_node('Multinomial',
                                 inputs=['input'],
                                 outputs=['output'],
                                 sample_size=12)
    return ([node], [input], [output])
@onnx_test()
def multinomial_generated_seed_test():
sample_size = 10
......@@ -5652,75 +6029,382 @@ def qlinearadd_bcast_test():
@onnx_test()
def qlinearconv_test():
# https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__QLinearConv.html
x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 7, 7])
sc_x = helper.make_tensor('1', TensorProto.FLOAT, [], [0.00369204697])
zero_pt_x = helper.make_tensor('2', TensorProto.UINT8, [], [132])
def qlinearaveragepool_1d_test():
x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 32])
x_scale = helper.make_tensor('x_scale', TensorProto.FLOAT, [], [0.05])
x_zero_point = helper.make_tensor('x_zero_point', TensorProto.INT8, [],
[0])
wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 1, 1], [0])
sc_wt = helper.make_tensor('4', TensorProto.FLOAT, [], [0.00172794575])
zero_pt_wt = helper.make_tensor('5', TensorProto.UINT8, [], [255])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 3, 31])
y_scale = helper.make_tensor('y_scale', TensorProto.FLOAT, [], [0.05])
y_zero_point = helper.make_tensor('y_zero_point', TensorProto.INT8, [],
[16])
sc_y = helper.make_tensor('6', TensorProto.FLOAT, [], [0.00162681262])
zero_pt_y = helper.make_tensor('7', TensorProto.UINT8, [], [123])
node = onnx.helper.make_node(
'QLinearAveragePool',
inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
outputs=['y'],
kernel_shape=[2],
)
out = helper.make_tensor_value_info('out', TensorProto.UINT8, [1, 1, 7, 7])
return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_2d_test():
x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 4, 4])
x_scale = helper.make_tensor('x_scale', TensorProto.FLOAT, [], [0.05])
x_zero_point = helper.make_tensor('x_zero_point', TensorProto.INT8, [],
[0])
y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 3, 3, 3])
y_scale = helper.make_tensor('y_scale', TensorProto.FLOAT, [], [0.015])
y_zero_point = helper.make_tensor('y_zero_point', TensorProto.INT8, [],
[16])
node = onnx.helper.make_node(
'QLinearConv',
inputs=['X', '1', '2', '3', '4', '5', '6', '7'],
outputs=['out'],
'QLinearAveragePool',
inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
outputs=['y'],
kernel_shape=[2, 2],
)
return ([node], [x], [out],
[sc_x, zero_pt_x, wt, sc_wt, zero_pt_wt, sc_y, zero_pt_y])
return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearconv_pad_1_test():
# https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__Conv.html
x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 5, 5])
sc_x = helper.make_tensor('1', TensorProto.FLOAT, [],
[0.09411764705882353])
zero_pt_x = helper.make_tensor('2', TensorProto.UINT8, [], [0])
def qlinearaveragepool_2d_ceil_test():
x = helper.make_tensor_value_info('x', TensorProto.UINT8, [1, 1, 4, 4])
x_scale = helper.make_tensor('x_scale', TensorProto.FLOAT, [], [0.5])
x_zero_point = helper.make_tensor('x_zero_point', TensorProto.UINT8, [],
[0])
wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 3, 3],
[1, 1, 1, 1, 1, 1, 1, 1, 1])
sc_wt = helper.make_tensor('4', TensorProto.FLOAT, [], [1.0])
zero_pt_wt = helper.make_tensor('5', TensorProto.UINT8, [], [0])
y = helper.make_tensor_value_info('y', TensorProto.UINT8, [1, 1, 2, 2])
y_scale = helper.make_tensor('y_scale', TensorProto.FLOAT, [], [0.05])
y_zero_point = helper.make_tensor('y_zero_point', TensorProto.UINT8, [],
[0])
sc_y = helper.make_tensor('6', TensorProto.FLOAT, [], [0.6352941176470588])
zero_pt_y = helper.make_tensor('7', TensorProto.UINT8, [], [0])
node = onnx.helper.make_node(
'QLinearAveragePool',
inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
outputs=['y'],
kernel_shape=[3, 3],
strides=[2, 2],
ceil_mode=True,
)
out = helper.make_tensor_value_info('out', TensorProto.UINT8, [1, 1, 5, 5])
return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_2d_dilations_test():
x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 1, 4, 4])
x_scale = helper.make_tensor('x_scale', TensorProto.FLOAT, [], [0.5])
x_zero_point = helper.make_tensor('x_zero_point', TensorProto.INT8, [],
[0])
y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 1, 2, 2])
y_scale = helper.make_tensor('y_scale', TensorProto.FLOAT, [], [0.25])
y_zero_point = helper.make_tensor('y_zero_point', TensorProto.INT8, [],
[84])
node = onnx.helper.make_node(
'QLinearConv',
inputs=['X', '1', '2', '3', '4', '5', '6', '7'],
outputs=['out'],
pads=[1, 1, 1, 1],
'QLinearAveragePool',
inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
outputs=['y'],
kernel_shape=[2, 2],
strides=[1, 1],
dilations=[2, 2],
ceil_mode=True,
)
return ([node], [x], [out],
[sc_x, zero_pt_x, wt, sc_wt, zero_pt_wt, sc_y, zero_pt_y])
return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearconv_pad_0_test():
# https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__Conv.html
x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 5, 5])
sc_x = helper.make_tensor('1', TensorProto.FLOAT, [],
[0.09411764705882353])
zero_pt_x = helper.make_tensor('2', TensorProto.UINT8, [], [0])
def qlinearaveragepool_2d_pads_count_include_pad_test():
x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 4, 4])
x_scale = helper.make_tensor('x_scale', TensorProto.FLOAT, [], [0.05])
x_zero_point = helper.make_tensor('x_zero_point', TensorProto.INT8, [],
[0])
wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 3, 3],
[1, 1, 1, 1, 1, 1, 1, 1, 1])
sc_wt = helper.make_tensor('4', TensorProto.FLOAT, [], [1.0])
zero_pt_wt = helper.make_tensor('5', TensorProto.UINT8, [], [0])
y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 3, 6, 6])
y_scale = helper.make_tensor('y_scale', TensorProto.FLOAT, [], [0.01])
y_zero_point = helper.make_tensor('y_zero_point', TensorProto.INT8, [],
[32])
sc_y = helper.make_tensor('6', TensorProto.FLOAT, [], [0.6352941176470588])
zero_pt_y = helper.make_tensor('7', TensorProto.INT8, [], [-128])
node = onnx.helper.make_node(
'QLinearAveragePool',
inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
outputs=['y'],
kernel_shape=[3, 3],
pads=[2, 2, 2, 2],
count_include_pad=1,
)
out = helper.make_tensor_value_info('out', TensorProto.INT8, [1, 1, 3, 3])
return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_2d_same_lower_test():
    """QLinearAveragePool 2x2 with SAME_LOWER auto-padding on uint8 data."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.UINT8, [1, 3, 4, 4])
    y = helper.make_tensor_value_info('y', TensorProto.UINT8, [1, 3, 4, 4])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.5])
    x_zero_point = scalar('x_zero_point', TensorProto.UINT8, [0])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.5])
    y_zero_point = scalar('y_zero_point', TensorProto.UINT8, [0])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        auto_pad="SAME_LOWER",
        kernel_shape=[2, 2])
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_2d_same_upper_test():
    """QLinearAveragePool 2x2 with SAME_UPPER auto-padding on int8 data."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 4, 4])
    y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 3, 4, 4])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.5])
    x_zero_point = scalar('x_zero_point', TensorProto.INT8, [32])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.25])
    y_zero_point = scalar('y_zero_point', TensorProto.INT8, [0])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        auto_pad="SAME_UPPER",
        kernel_shape=[2, 2])
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_2d_strides_test():
    """QLinearAveragePool with a 5x5 window and stride 2 on int8 data."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 8, 8])
    y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 3, 2, 2])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.05])
    x_zero_point = scalar('x_zero_point', TensorProto.INT8, [0])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.05])
    y_zero_point = scalar('y_zero_point', TensorProto.INT8, [8])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        strides=[2, 2],
        kernel_shape=[5, 5])
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_3d_test():
    """QLinearAveragePool with a 2x2x2 window over a 3-D int8 volume."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 3, 3, 3, 3])
    y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 3, 2, 2, 2])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.05])
    x_zero_point = scalar('x_zero_point', TensorProto.INT8, [0])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.02])
    y_zero_point = scalar('y_zero_point', TensorProto.INT8, [0])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        kernel_shape=[2, 2, 2])
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_notset_test():
    """QLinearAveragePool with explicit pads and auto_pad NOTSET (int8)."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.INT8, [1, 1, 5, 5])
    y = helper.make_tensor_value_info('y', TensorProto.INT8, [1, 1, 1, 1])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.5])
    x_zero_point = scalar('x_zero_point', TensorProto.INT8, [0])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.5])
    y_zero_point = scalar('y_zero_point', TensorProto.INT8, [10])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        kernel_shape=[6, 6],
        strides=[2, 2],
        pads=[0, 0, 1, 1],
        channels_last=0,
        auto_pad='NOTSET')
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearaveragepool_nt_cip_test():
    """QLinearAveragePool, NOTSET pads with count_include_pad=1 (uint8)."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('x', TensorProto.UINT8, [1, 1, 5, 5])
    y = helper.make_tensor_value_info('y', TensorProto.UINT8, [1, 1, 1, 1])
    x_scale = scalar('x_scale', TensorProto.FLOAT, [0.5])
    x_zero_point = scalar('x_zero_point', TensorProto.UINT8, [0])
    y_scale = scalar('y_scale', TensorProto.FLOAT, [0.5])
    y_zero_point = scalar('y_zero_point', TensorProto.UINT8, [10])
    node = onnx.helper.make_node(
        'QLinearAveragePool',
        inputs=['x', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],
        outputs=['y'],
        kernel_shape=[6, 6],
        strides=[2, 2],
        pads=[0, 0, 1, 1],
        channels_last=0,
        auto_pad='NOTSET',
        count_include_pad=1)
    return ([node], [x], [y], [x_scale, x_zero_point, y_scale, y_zero_point])
@onnx_test()
def qlinearconcat_test():
    """QLinearConcat of two rank-1 int8 tensors along axis 0."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    y_scale = scalar('1', TensorProto.FLOAT, [0.5])
    y_zero_point = scalar('2', TensorProto.INT8, [2])
    t0 = helper.make_tensor_value_info('t0', TensorProto.INT8, [2])
    s0 = scalar('3', TensorProto.FLOAT, [0.5])
    zp0 = scalar('4', TensorProto.INT8, [1])
    t1 = helper.make_tensor_value_info('t1', TensorProto.INT8, [3])
    s1 = scalar('5', TensorProto.FLOAT, [0.25])
    zp1 = scalar('6', TensorProto.INT8, [0])
    y = helper.make_tensor_value_info('out', TensorProto.INT8, [5])
    node = onnx.helper.make_node(
        'QLinearConcat',
        inputs=['1', '2', 't0', '3', '4', 't1', '5', '6'],
        outputs=['out'],
        axis=0)
    return ([node], [t0, t1], [y], [y_scale, y_zero_point, s0, zp0, s1, zp1])
@onnx_test()
def qlinearconcat_3d_test():
    """QLinearConcat of two rank-3 int8 tensors along axis 1."""
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    y_scale = scalar('1', TensorProto.FLOAT, [0.5])
    y_zero_point = scalar('2', TensorProto.INT8, [2])
    t0 = helper.make_tensor_value_info('t0', TensorProto.INT8, [3, 4, 2])
    s0 = scalar('3', TensorProto.FLOAT, [0.5])
    zp0 = scalar('4', TensorProto.INT8, [10])
    t1 = helper.make_tensor_value_info('t1', TensorProto.INT8, [3, 2, 2])
    s1 = scalar('5', TensorProto.FLOAT, [0.4])
    zp1 = scalar('6', TensorProto.INT8, [20])
    y = helper.make_tensor_value_info('out', TensorProto.UINT8, [3, 6, 2])
    node = onnx.helper.make_node(
        'QLinearConcat',
        inputs=['1', '2', 't0', '3', '4', 't1', '5', '6'],
        outputs=['out'],
        axis=1)
    return ([node], [t0, t1], [y], [y_scale, y_zero_point, s0, zp0, s1, zp1])
@onnx_test()
def qlinearconv_test():
    """QLinearConv with a 1x1 kernel; all quant params are initializers."""
    # https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__QLinearConv.html
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 7, 7])
    sc_x = scalar('1', TensorProto.FLOAT, [0.00369204697])
    zero_pt_x = scalar('2', TensorProto.UINT8, [132])
    wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 1, 1], [0])
    sc_wt = scalar('4', TensorProto.FLOAT, [0.00172794575])
    zero_pt_wt = scalar('5', TensorProto.UINT8, [255])
    sc_y = scalar('6', TensorProto.FLOAT, [0.00162681262])
    zero_pt_y = scalar('7', TensorProto.UINT8, [123])
    out = helper.make_tensor_value_info('out', TensorProto.UINT8, [1, 1, 7, 7])
    node = onnx.helper.make_node(
        'QLinearConv',
        inputs=['X', '1', '2', '3', '4', '5', '6', '7'],
        outputs=['out'])
    return ([node], [x], [out],
            [sc_x, zero_pt_x, wt, sc_wt, zero_pt_wt, sc_y, zero_pt_y])
@onnx_test()
def qlinearconv_pad_1_test():
    """QLinearConv with a 3x3 all-ones kernel and symmetric padding of 1."""
    # https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__Conv.html
    def scalar(name, dtype, vals):
        # 0-d quantization-parameter initializer
        return helper.make_tensor(name, dtype, [], vals)

    x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 5, 5])
    sc_x = scalar('1', TensorProto.FLOAT, [0.09411764705882353])
    zero_pt_x = scalar('2', TensorProto.UINT8, [0])
    wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 3, 3],
                            [1, 1, 1, 1, 1, 1, 1, 1, 1])
    sc_wt = scalar('4', TensorProto.FLOAT, [1.0])
    zero_pt_wt = scalar('5', TensorProto.UINT8, [0])
    sc_y = scalar('6', TensorProto.FLOAT, [0.6352941176470588])
    zero_pt_y = scalar('7', TensorProto.UINT8, [0])
    out = helper.make_tensor_value_info('out', TensorProto.UINT8, [1, 1, 5, 5])
    node = onnx.helper.make_node(
        'QLinearConv',
        inputs=['X', '1', '2', '3', '4', '5', '6', '7'],
        outputs=['out'],
        pads=[1, 1, 1, 1])
    return ([node], [x], [out],
            [sc_x, zero_pt_x, wt, sc_wt, zero_pt_wt, sc_y, zero_pt_y])
@onnx_test()
def qlinearconv_pad_0_test():
# https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__Conv.html
x = helper.make_tensor_value_info('X', TensorProto.UINT8, [1, 1, 5, 5])
sc_x = helper.make_tensor('1', TensorProto.FLOAT, [],
[0.09411764705882353])
zero_pt_x = helper.make_tensor('2', TensorProto.UINT8, [], [0])
wt = helper.make_tensor('3', TensorProto.UINT8, [1, 1, 3, 3],
[1, 1, 1, 1, 1, 1, 1, 1, 1])
sc_wt = helper.make_tensor('4', TensorProto.FLOAT, [], [1.0])
zero_pt_wt = helper.make_tensor('5', TensorProto.UINT8, [], [0])
sc_y = helper.make_tensor('6', TensorProto.FLOAT, [], [0.6352941176470588])
zero_pt_y = helper.make_tensor('7', TensorProto.INT8, [], [-128])
out = helper.make_tensor_value_info('out', TensorProto.INT8, [1, 1, 3, 3])
node = onnx.helper.make_node(
'QLinearConv',
......@@ -5783,6 +6467,26 @@ def qlinearglobalavgpool_test():
return ([n], [x], [y], [sc_x, z_pt_x, sc_y, z_pt_y])
@onnx_test()
def qlinearleakyrelu_test():
    """QLinearLeakyRelu over 64 int8 values with alpha > 1."""
    in_info = helper.make_tensor_value_info('X', TensorProto.INT8, [64])
    out_info = helper.make_tensor_value_info('Y', TensorProto.INT8, [64])

    x_scale = helper.make_tensor('X_scale', TensorProto.FLOAT, [], [0.05])
    x_zp = helper.make_tensor('X_zero_point', TensorProto.INT8, [], [0])
    y_scale = helper.make_tensor('Y_scale', TensorProto.FLOAT, [], [0.05])
    y_zp = helper.make_tensor('Y_zero_point', TensorProto.INT8, [], [10])

    node = onnx.helper.make_node(
        'QLinearLeakyRelu',
        inputs=['X', 'X_scale', 'X_zero_point', 'Y_scale', 'Y_zero_point'],
        outputs=['Y'],
        alpha=1.1,
    )

    return ([node], [in_info], [out_info], [x_scale, x_zp, y_scale, y_zp])
def qlinearmatmul_1D_test():
a = helper.make_tensor_value_info('A', TensorProto.UINT8, [8])
sc_a = helper.make_tensor('A_scale', TensorProto.FLOAT, [], [0.05])
......@@ -5868,6 +6572,81 @@ def qlinearmatmul_3D_test():
[sc_a, zero_pt_a, sc_b, zero_pt_b, sc_c, zero_pt_c])
@onnx_test()
def qlinearmul_test():
    """Elementwise QLinearMul of two 64-element uint8 tensors."""
    a_info = helper.make_tensor_value_info('A', TensorProto.UINT8, [64])
    b_info = helper.make_tensor_value_info('B', TensorProto.UINT8, [64])
    c_info = helper.make_tensor_value_info('C', TensorProto.UINT8, [64])

    a_scale = helper.make_tensor('A_scale', TensorProto.FLOAT, [], [0.05])
    a_zp = helper.make_tensor('A_zero_point', TensorProto.UINT8, [], [0])
    b_scale = helper.make_tensor('B_scale', TensorProto.FLOAT, [], [0.05])
    b_zp = helper.make_tensor('B_zero_point', TensorProto.UINT8, [], [16])
    c_scale = helper.make_tensor('C_scale', TensorProto.FLOAT, [], [0.05])
    c_zp = helper.make_tensor('C_zero_point', TensorProto.UINT8, [], [100])

    node = onnx.helper.make_node(
        'QLinearMul',
        inputs=[
            'A', 'A_scale', 'A_zero_point', 'B', 'B_scale', 'B_zero_point',
            'C_scale', 'C_zero_point'
        ],
        outputs=['C'],
    )

    return ([node], [a_info, b_info], [c_info],
            [a_scale, a_zp, b_scale, b_zp, c_scale, c_zp])
@onnx_test()
def qlinearmul_bcast_test():
    """QLinearMul where B is rank-3, so A is broadcast against it."""
    a_info = helper.make_tensor_value_info('A', TensorProto.INT8, [64])
    b_info = helper.make_tensor_value_info('B', TensorProto.INT8, [1, 1, 64])
    c_info = helper.make_tensor_value_info('C', TensorProto.INT8, [1, 1, 64])

    a_scale = helper.make_tensor('A_scale', TensorProto.FLOAT, [], [0.05])
    a_zp = helper.make_tensor('A_zero_point', TensorProto.INT8, [], [0])
    b_scale = helper.make_tensor('B_scale', TensorProto.FLOAT, [], [0.05])
    # NOTE(review): 128 lies outside the int8 range [-128, 127] — presumably
    # it wraps when packed into the INT8 tensor; confirm the intended value.
    b_zp = helper.make_tensor('B_zero_point', TensorProto.INT8, [], [128])
    c_scale = helper.make_tensor('C_scale', TensorProto.FLOAT, [], [0.15])
    c_zp = helper.make_tensor('C_zero_point', TensorProto.INT8, [], [32])

    node = onnx.helper.make_node(
        'QLinearMul',
        inputs=[
            'A', 'A_scale', 'A_zero_point', 'B', 'B_scale', 'B_zero_point',
            'C_scale', 'C_zero_point'
        ],
        outputs=['C'],
    )

    return ([node], [a_info, b_info], [c_info],
            [a_scale, a_zp, b_scale, b_zp, c_scale, c_zp])
@onnx_test()
def qlinearsigmoid_test():
    """QLinearSigmoid over 64 int8 values; output zero point pinned at -128."""
    in_info = helper.make_tensor_value_info('X', TensorProto.INT8, [64])
    out_info = helper.make_tensor_value_info('Y', TensorProto.INT8, [64])

    x_scale = helper.make_tensor('X_scale', TensorProto.FLOAT, [], [0.05])
    x_zp = helper.make_tensor('X_zero_point', TensorProto.INT8, [], [0])
    y_scale = helper.make_tensor('Y_scale', TensorProto.FLOAT, [], [0.0035])
    y_zp = helper.make_tensor('Y_zero_point', TensorProto.INT8, [], [-128])

    node = onnx.helper.make_node(
        'QLinearSigmoid',
        inputs=['X', 'X_scale', 'X_zero_point', 'Y_scale', 'Y_zero_point'],
        outputs=['Y'],
    )

    return ([node], [in_info], [out_info], [x_scale, x_zp, y_scale, y_zp])
@onnx_test()
def quantizelinear_test():
arg0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [5])
......@@ -6947,6 +7726,16 @@ def roialign_test():
return ([node], [x, roi, bi], [y])
@onnx_test()
def round_half_test():
    """Round applied to a 4x4 fp16 tensor (half-precision rounding)."""
    in_info = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [4, 4])
    out_info = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [4, 4])
    node = onnx.helper.make_node('Round', inputs=['x'], outputs=['y'])
    return ([node], [in_info], [out_info])
@onnx_test()
def scatter_add_test():
x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 4, 5, 6])
......@@ -7007,8 +7796,7 @@ def scatter_none_test():
return ([node], [x, i, u], [y])
@onnx_test()
def scatternd_add_test():
def make_scatternd_test(reduction="none"):
data = helper.make_tensor_value_info('data', TensorProto.FLOAT, [2, 2, 2])
indices = helper.make_tensor_value_info('indices', TensorProto.INT64,
[2, 1, 2])
......@@ -7020,44 +7808,39 @@ def scatternd_add_test():
node = onnx.helper.make_node('ScatterND',
inputs=['data', 'indices', 'updates'],
outputs=['output'],
reduction="add")
reduction=reduction)
return ([node], [data, indices, updates], [output])
@onnx_test()
def scatternd_add_test():
    """ScatterND with reduction="add", built by the shared helper."""
    reduction = "add"
    return make_scatternd_test(reduction)
@onnx_test()
def scatternd_mul_test():
    """ScatterND with reduction="mul", built by the shared helper.

    Previously this function carried a full inline copy of the
    tensor/node setup: the assignments before the return were unused and
    the node construction after it was unreachable — dead code left over
    from the merge that introduced make_scatternd_test(). Only the
    delegating return is live.
    """
    return make_scatternd_test("mul")
@onnx_test()
def scatternd_max_test():
    """ScatterND with reduction="max", built by the shared helper."""
    reduction = "max"
    return make_scatternd_test(reduction)
@onnx_test()
def scatternd_min_test():
    """ScatterND with reduction="min", built by the shared helper."""
    reduction = "min"
    return make_scatternd_test(reduction)
@onnx_test()
def scatternd_test():
    """ScatterND with the default reduction ("none"), via the shared helper.

    The tensor definitions that used to precede the return were unused,
    and the node construction after it was unreachable — merge leftovers
    from when make_scatternd_test() replaced the inline setup. Only the
    delegating return is live.
    """
    return make_scatternd_test()
@onnx_test()
def scatternd_invalid_reduction_test():
    """ScatterND built with a reduction value that is not a legal ONNX
    reduction — presumably a negative test for the parser; confirm against
    the C++ test that loads it."""
    bad_reduction = "invalid"
    return make_scatternd_test(bad_reduction)
@onnx_test()
......@@ -7866,6 +8649,32 @@ def slice_var_input_dyn1():
return ([node], [data, starts, ends, axes], [output])
@onnx_test()
def slice_var_input_default_steps():
    """Slice with runtime starts/ends/axes and a constant all-ones step."""
    steps = np.array([1, 1])
    steps_tensor = helper.make_tensor(name="step",
                                      data_type=TensorProto.INT64,
                                      dims=steps.shape,
                                      vals=steps.astype(int))
    # Feed the step input from a Constant node rather than an initializer.
    const_step = helper.make_node("Constant",
                                  inputs=[],
                                  outputs=['arg_step'],
                                  value=steps_tensor)

    data = helper.make_tensor_value_info('data', TensorProto.FLOAT, [None, 2])
    starts = helper.make_tensor_value_info('starts', TensorProto.INT64, [2])
    ends = helper.make_tensor_value_info('ends', TensorProto.INT64, [2])
    axes = helper.make_tensor_value_info('axes', TensorProto.INT64, [2])
    output = helper.make_tensor_value_info('output', TensorProto.FLOAT,
                                           [1, 2])

    slice_node = onnx.helper.make_node(
        'Slice',
        inputs=['data', 'starts', 'ends', 'axes', 'arg_step'],
        outputs=['output'])

    return ([const_step, slice_node], [data, starts, ends, axes], [output])
@onnx_test()
def slice_var_input_steps_error():
step = np.array([2, 1])
......@@ -7879,9 +8688,9 @@ def slice_var_input_steps_error():
value=step_tensor)
data = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 2])
starts = helper.make_tensor_value_info('starts', TensorProto.FLOAT, [2])
ends = helper.make_tensor_value_info('ends', TensorProto.FLOAT, [2])
axes = helper.make_tensor_value_info('axes', TensorProto.FLOAT, [2])
starts = helper.make_tensor_value_info('starts', TensorProto.INT64, [2])
ends = helper.make_tensor_value_info('ends', TensorProto.INT64, [2])
axes = helper.make_tensor_value_info('axes', TensorProto.INT64, [2])
output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 2])
node = onnx.helper.make_node(
......@@ -8042,6 +8851,42 @@ def split_test_no_attribute():
return ([const_node, node], [x], [y1, y2, y3, y4])
@onnx_test()
def split_test_uneven():
    """Split of 12 rows into 5 outputs with no split attribute.

    The declared shapes are four chunks of 3 rows and a final empty chunk.
    """
    in_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, [12, 15])
    out_infos = [
        helper.make_tensor_value_info('y%d' % (i + 1), TensorProto.FLOAT,
                                      [rows, 15])
        for i, rows in enumerate([3, 3, 3, 3, 0])
    ]
    node = onnx.helper.make_node(
        'Split',
        inputs=['x'],
        outputs=['y1', 'y2', 'y3', 'y4', 'y5'],
    )
    return ([node], [in_info], out_infos)
@onnx_test()
def split_test_uneven_num_outputs():
    """Split of 11 rows with num_outputs=4: three chunks of 3, one of 2."""
    in_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
    out_infos = [
        helper.make_tensor_value_info('y%d' % (i + 1), TensorProto.FLOAT,
                                      [rows, 15])
        for i, rows in enumerate([3, 3, 3, 2])
    ]
    node = onnx.helper.make_node(
        'Split',
        inputs=['x'],
        outputs=['y1', 'y2', 'y3', 'y4'],
        num_outputs=4,
    )
    return ([node], [in_info], out_infos)
@onnx_test()
def split_test_no_attribute_invalid_split():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [300, 15])
......@@ -8101,6 +8946,24 @@ def split_test_no_attribute_invalid_input_split():
return ([node], [x], [y1, y2, y3])
@onnx_test()
def split_test_invalid_num_outputs():
    """Split whose num_outputs attribute (5) disagrees with the 4 declared
    node outputs — presumably a negative parser test; confirm the loader is
    expected to reject the mismatch."""
    in_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
    out_infos = [
        helper.make_tensor_value_info('y%d' % (i + 1), TensorProto.FLOAT,
                                      [rows, 15])
        for i, rows in enumerate([3, 3, 3, 2])
    ]
    node = onnx.helper.make_node(
        'Split',
        inputs=['x'],
        outputs=['y1', 'y2', 'y3', 'y4'],
        num_outputs=5,
    )
    return ([node], [in_info], out_infos)
@onnx_test()
def sqrt_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])
......@@ -8764,6 +9627,97 @@ def undefined_test():
return ([node], [x], [y])
@onnx_test()
def unique_dynamic_sorted_test():
    """Unique along axis 0 with sorted output on a runtime 6-element input."""
    in_info = helper.make_tensor_value_info('X', TensorProto.FLOAT, [6])
    uniq = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [4])
    first_idx = helper.make_tensor_value_info('indices', TensorProto.INT64,
                                              [4])
    inv_idx = helper.make_tensor_value_info('inverse_indices',
                                            TensorProto.INT64, [6])
    counts = helper.make_tensor_value_info('counts', TensorProto.INT64, [4])

    node = onnx.helper.make_node(
        'Unique',
        inputs=['X'],
        outputs=['Y', 'indices', 'inverse_indices', 'counts'],
        axis=0,
        sorted=1)

    return ([node], [in_info], [uniq, first_idx, inv_idx, counts])
@onnx_test()
def unique_dynamic_sorted_3D_test():
    """Unique (sorted, no axis attribute) on a 4x4x4 int64 input.

    With no axis the outputs are declared over the flattened 64 elements.
    """
    in_info = helper.make_tensor_value_info('X', TensorProto.INT64, [4, 4, 4])
    uniq = helper.make_tensor_value_info('Y', TensorProto.INT64, [16])
    first_idx = helper.make_tensor_value_info('indices', TensorProto.INT64,
                                              [16])
    inv_idx = helper.make_tensor_value_info('inverse_indices',
                                            TensorProto.INT64, [64])
    counts = helper.make_tensor_value_info('counts', TensorProto.INT64, [16])

    node = onnx.helper.make_node(
        'Unique',
        inputs=['X'],
        outputs=['Y', 'indices', 'inverse_indices', 'counts'],
        sorted=1)

    return ([node], [in_info], [uniq, first_idx, inv_idx, counts])
@onnx_test()
def unique_dynamic_unsorted_test():
    """Unique along axis 0 with sorted=0 on a runtime 6-element input."""
    in_info = helper.make_tensor_value_info('X', TensorProto.FLOAT, [6])
    uniq = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [4])
    first_idx = helper.make_tensor_value_info('indices', TensorProto.INT64,
                                              [4])
    inv_idx = helper.make_tensor_value_info('inverse_indices',
                                            TensorProto.INT64, [6])
    counts = helper.make_tensor_value_info('counts', TensorProto.INT64, [4])

    node = onnx.helper.make_node(
        'Unique',
        inputs=['X'],
        outputs=['Y', 'indices', 'inverse_indices', 'counts'],
        axis=0,
        sorted=0)

    return ([node], [in_info], [uniq, first_idx, inv_idx, counts])
@onnx_test()
def unique_sorted_test():
    """Unique(axis=0, sorted) on a constant 6-element input initializer."""
    const_in = helper.make_tensor('X', TensorProto.FLOAT, [6],
                                  [2, 1, 1, 3, 4, 3])
    uniq = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [4])
    first_idx = helper.make_tensor_value_info('indices', TensorProto.INT64,
                                              [4])
    inv_idx = helper.make_tensor_value_info('inverse_indices',
                                            TensorProto.INT64, [6])
    counts = helper.make_tensor_value_info('counts', TensorProto.INT64, [4])

    node = onnx.helper.make_node(
        'Unique',
        inputs=['X'],
        outputs=['Y', 'indices', 'inverse_indices', 'counts'],
        axis=0,
        sorted=1)

    # The input lives in the initializer list, so the graph has no inputs.
    return ([node], [], [uniq, first_idx, inv_idx, counts], [const_in])
@onnx_test()
def unique_unsorted_test():
    """Unique(axis=0, sorted=0) on a constant 6-element input initializer."""
    const_in = helper.make_tensor('X', TensorProto.FLOAT, [6],
                                  [2, 1, 1, 3, 4, 3])
    uniq = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [4])
    first_idx = helper.make_tensor_value_info('indices', TensorProto.INT64,
                                              [4])
    inv_idx = helper.make_tensor_value_info('inverse_indices',
                                            TensorProto.INT64, [6])
    counts = helper.make_tensor_value_info('counts', TensorProto.INT64, [4])

    node = onnx.helper.make_node(
        'Unique',
        inputs=['X'],
        outputs=['Y', 'indices', 'inverse_indices', 'counts'],
        axis=0,
        sorted=0)

    # The input lives in the initializer list, so the graph has no inputs.
    return ([node], [], [uniq, first_idx, inv_idx, counts], [const_in])
@onnx_test()
def unknown_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [2, 3, 4, 5])
......@@ -8837,6 +9791,20 @@ def upsample_test():
return ([node], [X], [Y], [scale_tensor])
@onnx_test()
def upsample_ver7_test():
    """Upsample in its opset-7 form: scales given as an attribute.

    Nearest-neighbor mode, scaling H by 2 and W by 3 (2x2 -> 4x6).
    """
    in_info = helper.make_tensor_value_info('X', TensorProto.FLOAT,
                                            [1, 1, 2, 2])
    out_info = helper.make_tensor_value_info('Y', TensorProto.FLOAT,
                                             [1, 1, 4, 6])
    node = onnx.helper.make_node('Upsample',
                                 inputs=['X'],
                                 outputs=['Y'],
                                 mode='nearest',
                                 scales=[1.0, 1.0, 2.0, 3.0])
    return ([node], [in_info], [out_info])
@onnx_test()
def variable_batch_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment