Unverified Commit a24ed87e authored by Chris Austen, committed by GitHub

Merge branch 'develop' into optimize_jenkinsfile

parents 6481cd69 a09dc502
......@@ -27,6 +27,7 @@
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/iota_iterator.hpp>
#include <migraphx/kernels/float8.hpp>
namespace migraphx {
......
......@@ -251,7 +251,7 @@ constexpr T numeric_max()
}
template <class T>
-constexpr T numeric_lowest()
+constexpr auto numeric_lowest() -> decltype(numeric_max<T>())
{
if constexpr(is_integral<T>{})
{
......
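The trailing return type above ties `numeric_lowest` to whatever `numeric_max<T>()` yields for that type, and it makes the template SFINAE-friendly: a `T` without a usable `numeric_max` drops out of overload resolution instead of producing a hard error. A minimal standalone sketch, using std::numeric_limits as a stand-in for the kernel traits (an assumption, not the MIGraphX implementation):

#include <limits>
#include <type_traits>

template <class T>
constexpr T numeric_max() { return std::numeric_limits<T>::max(); }

// The trailing return type is only well-formed when numeric_max<T>() is,
// so an unsupported T removes this overload rather than failing to compile.
template <class T>
constexpr auto numeric_lowest() -> decltype(numeric_max<T>())
{
    if constexpr(std::is_integral<T>{})
        return std::numeric_limits<T>::min();
    else
        return -numeric_max<T>();
}

static_assert(numeric_lowest<int>() == std::numeric_limits<int>::min());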
......@@ -207,7 +207,7 @@ struct implicit_conversion_op
template <class U>
constexpr operator U() const
{
-return x;
+return static_cast<U>(x);
}
};
......
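The `static_cast<U>(x)` above matters for types with explicit converting constructors: a bare `return x;` relies on implicit conversion, which fails once `U` is a type like the new float8 class. A hedged sketch of the failure mode, with a made-up `fp8_like` type standing in for the real one:

struct fp8_like
{
    constexpr explicit fp8_like(float) {} // explicit: no implicit conversion in
};

template <class T>
struct implicit_conversion_op
{
    T x;
    template <class U>
    constexpr operator U() const
    {
        return static_cast<U>(x); // `return x;` would not compile for U = fp8_like
    }
};

// Invokes operator U() with U deduced as fp8_like:
constexpr fp8_like convert(float v) { return implicit_conversion_op<float>{v}; }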
......@@ -61,9 +61,8 @@ struct miopen_apply
const lowering* pass = nullptr;
std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
instruction_ref last{};
-bool offload_copy = false;
-bool int8_x4_format = true;
-bool compute_fp32 = false;
+bool offload_copy = false;
+bool compute_fp32 = false;
context& get_context() const
{
......@@ -84,10 +83,8 @@ struct miopen_apply
assert(mod != nullptr);
assert(pass != nullptr);
auto& ctx = get_context();
-int8_x4_format = get_int8_x4_format(ctx);
-compute_fp32 = get_compute_fp32_flag();
-offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false;
+compute_fp32 = get_compute_fp32_flag();
+offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false;
add_generic_op("contiguous");
add_extend_op("argmax");
......@@ -231,18 +228,15 @@ struct miopen_apply
assert(refs.size() == 2);
auto output = insert_allocation(ins, ins->get_shape());
refs.push_back(output);
-return mod->replace_instruction(
-ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
+return mod->replace_instruction(ins, rocblas_gemm<Op>{Op{}, 1, 0, compute_fp32}, refs);
});
}
void add_convolution_op(const std::string& name)
{
apply_map.emplace(name, [=](instruction_ref ins) {
-operation conv = make_op(
-"gpu::" + name,
-{{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});
-auto output = insert_allocation(ins, ins->get_shape());
+operation conv = make_op("gpu::" + name, {{"op", ins->get_operator().to_value()}});
+auto output = insert_allocation(ins, ins->get_shape());
return mod->replace_instruction(ins,
make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
......
......@@ -37,7 +37,7 @@
#include <mlir-c/Pass.h>
#include <mlir-c/Support.h>
#include <mutex>
-#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
+#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 4
#warning "Incompatible version of rocMLIR library used, disabling"
// Only undefine when not using cppcheck
#ifndef CPPCHECK
......@@ -73,6 +73,7 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNE_EXHAUSTIVE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNE_LIMIT);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_DB);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_CFG);
......@@ -318,31 +319,30 @@ struct mlir_program
return result;
}
-MlirType make_tensor(const shape& s) const
+MlirType make_mlir_shaped(const shape& s) const
{
-if(not s.standard())
-MIGRAPHX_THROW("MLIR expects all tensors to be in standard shape");
if(s.dynamic())
MIGRAPHX_THROW("MLIR does not support dynamic shapes");
std::vector<int64_t> lens(s.lens().begin(), s.lens().end());
-return mlirRankedTensorTypeGet(
-lens.size(), lens.data(), make_type(s.type()), mlirAttributeGetNull());
+std::vector<int64_t> strides(s.strides().begin(), s.strides().end());
+return rocmlirMIXRShapedTypeGet(
+lens.size(), lens.data(), strides.data(), make_type(s.type()));
}
template <class Range>
-std::vector<MlirType> make_tensors(const Range& r)
+std::vector<MlirType> make_mlir_shapeds(const Range& r)
{
std::vector<MlirType> result;
std::transform(r.begin(), r.end(), std::back_inserter(result), [&](const auto& s) {
-return make_tensor(s);
+return make_mlir_shaped(s);
});
return result;
}
MlirType make_function_type(const std::vector<shape>& inputs, const std::vector<shape>& outputs)
{
-auto in = make_tensors(inputs);
-auto out = make_tensors(outputs);
+auto in = make_mlir_shapeds(inputs);
+auto out = make_mlir_shapeds(outputs);
return mlirFunctionTypeGet(ctx.get(), in.size(), in.data(), out.size(), out.data());
}
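The `make_tensor` to `make_mlir_shaped` switch above is the substance of the API version 4 bump earlier in the diff: `rocmlirMIXRShapedTypeGet` accepts strides alongside lengths, so the old `not s.standard()` rejection is gone and non-packed layouts flow through unchanged. A hedged illustration of what the new type can express (a fragment that only compiles in context, since `MlirType` and the rocMLIR call come from the headers above):

// A transposed 4x3 view: same lengths, permuted strides. The old
// mlirRankedTensorTypeGet path could only encode the packed row-major
// layout lens={4,3}, strides={3,1}; the shaped type carries both vectors.
std::vector<int64_t> lens    = {4, 3};
std::vector<int64_t> strides = {1, 3}; // column-major layout
MlirType t = rocmlirMIXRShapedTypeGet(lens.size(), lens.data(), strides.data(), element_type);
// element_type: assumed to come from make_type(s.type()) as in the hunk above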
......@@ -504,11 +504,7 @@ struct mlir_program
mlir_operation_state& add_results(const std::vector<shape>& outputs)
{
-std::vector<shape> reshaped(outputs.size());
-std::transform(outputs.begin(), outputs.end(), reshaped.begin(), [](const shape& r) {
-return shape{r.type(), r.lens()};
-});
-auto x = prog->make_tensors(reshaped);
+auto x = prog->make_mlir_shapeds(outputs);
if(not x.empty())
{
mlirOperationStateAddResults(&op_state, x.size(), x.data());
......@@ -581,7 +577,7 @@ struct mlir_program
std::vector<shape> outputs = m.get_output_shapes();
std::vector<MlirLocation> arg_locs(inputs.size(), location);
-auto body_inputs = make_tensors(inputs);
+auto body_inputs = make_mlir_shapeds(inputs);
mlir_region region = mlirRegionCreate();
mlir_block fbody = mlirBlockCreate(body_inputs.size(), body_inputs.data(), arg_locs.data());
MlirBlock result = fbody.get();
......@@ -607,7 +603,7 @@ struct mlir_program
return "func.return";
if(ins->name() == "@literal")
{
return "tosa.const";
return "migraphx.literal";
}
return "migraphx." + ins->name();
}
......@@ -666,7 +662,8 @@ struct mlir_program
if(ins->name() == "@literal")
{
literal r = ins->get_literal();
-MlirType tensor_type = make_tensor(ins->get_shape());
+MlirType shaped_type = make_mlir_shaped(ins->get_shape());
+MlirType tensor_type = rocmlirMIXRShapedTypeAsTensor(shaped_type);
MlirAttribute mlir_value_attr =
mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
ops.add_attributes({{"value", mlir_value_attr}});
......@@ -796,7 +793,9 @@ struct mlir_program
if(enabled(MIGRAPHX_MLIR_TUNE_EXHAUSTIVE{}))
tuning_mode = RocmlirTuningParamSetKindExhaustive;
mlir_tuning_space params{mlirRockTuningSpaceCreate(mmodule.get(), tuning_mode)};
-for(auto i : range(mlirRockTuningGetNumParams(params.get())))
+const auto limit =
+value_of(MIGRAPHX_MLIR_TUNE_LIMIT{}, std::numeric_limits<std::size_t>::max());
+for(auto i : range(std::min<std::size_t>(limit, mlirRockTuningGetNumParams(params.get()))))
{
mlir_tuning_param param{mlirRockTuningParamCreate()};
if(not mlirRockTuningParamGet(params.get(), i, param.get()))
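The new loop bound above caps tuning enumeration: `value_of` reads `MIGRAPHX_MLIR_TUNE_LIMIT` from the environment and falls back to `std::numeric_limits<std::size_t>::max()` (effectively unlimited) when it is unset. Worked through with hypothetical numbers:

#include <algorithm>
#include <cstddef>
#include <limits>

// Suppose the tuning space has 1840 candidates (a made-up figure):
constexpr std::size_t unset = std::numeric_limits<std::size_t>::max();
static_assert(std::min<std::size_t>(100, 1840) == 100);    // MIGRAPHX_MLIR_TUNE_LIMIT=100
static_assert(std::min<std::size_t>(unset, 1840) == 1840); // variable unset: full sweep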
......@@ -942,35 +941,7 @@ void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
auto param = m.get_parameter(name);
if(input.standard())
continue;
-auto lens = input.lens();
-auto strides = input.strides();
-std::vector<operation> ops;
-if(input.transposed())
-{
-auto perm = find_permutation(input);
-auto iperm = invert_permutation(perm);
-lens = reorder_dims(lens, iperm);
-strides = reorder_dims(strides, iperm);
-ops.push_back(make_op("transpose", {{"permutation", perm}}));
-}
-if(input.broadcasted())
-{
-std::transform(lens.begin(),
-lens.end(),
-strides.begin(),
-lens.begin(),
-[](auto len, auto stride) -> std::size_t {
-if(stride == 0)
-return 1;
-return len;
-});
-ops.push_back(make_op("multibroadcast", {{"out_lens", input.lens()}}));
-}
-auto new_param =
-std::accumulate(ops.begin(),
-ops.end(),
-m.add_parameter(name + ".0", shape{input.type(), lens}),
-[&](auto x, auto op) { return m.insert_instruction(param, op, x); });
+auto new_param = m.add_parameter(name + ".0", input);
m.replace_instruction(param, new_param);
m.remove_instruction(param);
}
......@@ -1032,6 +1003,15 @@ tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
mlir_program mp;
mp.set_gpu_properties(migraphx_ctx);
mp.parse(m);
+const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
+static std::mutex mutex;
+if(trace)
+{
+const std::lock_guard<std::mutex> lock(mutex);
+auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
+std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
+}
return mp.get_tuning_config(exhaustive);
}
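The tracing block above prints the parsed module when `MIGRAPHX_TRACE_MLIR` is set, and the function-local `static std::mutex` keeps output from interleaving when several modules are compiled concurrently. The same pattern, standalone:

#include <iostream>
#include <mutex>
#include <string>

// One mutex per trace site, so concurrent callers emit whole messages
// rather than interleaved fragments.
void trace_print(const std::string& text)
{
    static std::mutex m;
    const std::lock_guard<std::mutex> lock(m);
    std::cout << text << std::endl;
}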
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iterator>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/int8_gemm_pack.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/permutation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
{
auto s = ins->get_shape();
auto lens = s.lens();
auto k = lens[lens.size() + offset];
auto pad_k = (k + 3) / 4 * 4;
auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k;
auto ret_ins = ins;
if(pad_k != k)
{
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens};
auto ins_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(ps)}}));
auto pad = make_op("pad", {{"pads", pad_dims}});
ret_ins =
m.insert_instruction(std::next(ins), make_op("gpu::pad", pad.to_value()), ins, ins_out);
}
return ret_ins;
}
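The key arithmetic in `pad_ins` is `(k + 3) / 4 * 4`, which rounds `k` up to the next multiple of 4 via truncating integer division; the pad op is only inserted when that differs from `k`. Worked through:

#include <cstddef>

constexpr std::size_t round_up4(std::size_t k) { return (k + 3) / 4 * 4; }

static_assert(round_up4(1) == 4);
static_assert(round_up4(5) == 8); // (5 + 3) / 4 * 4 = 2 * 4
static_assert(round_up4(8) == 8); // already aligned: pad_k == k, no pad op added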
static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
{
std::vector<instruction_ref> ret_inputs;
auto inputs = ins->inputs();
auto in0 = inputs.at(0);
auto sa = in0->get_shape();
bool transa = sa.transposed();
if(transa)
{
auto perm = find_permutation(sa);
auto val = in0->get_operator().to_value();
if(val.contains("dims"))
{
int offset = static_cast<int>(perm.back()) - static_cast<int>(perm.size());
auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>();
auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in);
}
else
{
shape cs{in0->get_shape().type(), in0->get_shape().lens()};
auto con_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(cs)}}));
auto cin0 = m.insert_instruction(ins, make_op("gpu::contiguous"), in0, con_out);
ret_inputs.push_back(pad_ins(m, cin0, -1));
}
}
else
{
ret_inputs.push_back(pad_ins(m, in0, -1));
}
auto in1 = inputs.at(1);
auto sb = in1->get_shape();
bool transb = sb.transposed();
if(transb)
{
auto perm = find_permutation(sb);
auto val = in1->get_operator().to_value();
if(val.contains("dims"))
{
int offset = static_cast<int>(perm[perm.size() - 2]) - static_cast<int>(perm.size());
auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>();
auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in);
}
else
{
shape cs{in1->get_shape().type(), in1->get_shape().lens()};
auto con_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(cs)}}));
auto cin1 = m.insert_instruction(ins, make_op("gpu::contiguous"), in1, con_out);
ret_inputs.push_back(pad_ins(m, cin1, -2));
}
}
else
{
ret_inputs.push_back(pad_ins(m, in1, -2));
}
std::copy(inputs.begin() + 2, inputs.end(), std::back_inserter(ret_inputs));
return ret_inputs;
}
void pack_int8_args::apply(module& m) const
{
for(auto ins : iterator_for(m))
{
if(ins->name() == "gpu::quant_gemm")
{
auto val = ins->get_operator().to_value();
assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens();
// gemm needs k to be a multiple of 4, so that dimension needs padding
auto old_inputs = inputs;
if((lens.back() % 4) != 0)
{
inputs = pad_inputs(m, ins);
}
bool transa = inputs[0]->get_shape().transposed();
bool transb = inputs[1]->get_shape().transposed();
if(not transb)
{
auto packed_b = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(inputs[1]->get_shape())}}));
auto output_b = m.insert_instruction(
ins, make_op("gpu::int8_gemm_pack_a"), {inputs[1], packed_b});
inputs[1] = output_b;
}
if(transa)
{
auto packed_a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(inputs[0]->get_shape())}}));
auto output_a = m.insert_instruction(
ins, make_op("gpu::int8_gemm_pack_b"), {inputs[0], packed_a});
inputs[0] = output_a;
}
if(inputs != old_inputs)
{
m.replace_instruction(ins, ins->get_operator(), inputs);
}
}
else if(ins->name() == "gpu::quant_convolution")
{
auto val = ins->get_operator().to_value();
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs();
auto packed_x = m.insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(pack_int8_shape(inputs[0]->get_shape()))}}));
auto output_x =
m.insert_instruction(ins, make_op("gpu::int8_conv_pack"), {inputs[0], packed_x});
instruction::replace_argument(ins, inputs[0], output_x);
auto packed_w = m.insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(pack_int8_shape(inputs[1]->get_shape()))}}));
auto output_w =
m.insert_instruction(ins, make_op("gpu::int8_conv_pack"), {inputs[1], packed_w});
instruction::replace_argument(ins, inputs[1], output_w);
}
}
}
shape pack_int8_args::pack_int8_shape(const shape& s) const
{
if(s.type() != shape::int8_type)
{
MIGRAPHX_THROW("PACK_INT8_ARGS: only process int8_type");
}
auto lens = s.lens();
auto strides = s.strides();
lens[1] = (lens[1] + 3) / 4 * 4;
strides[0] = strides[1] * lens[1];
return {s.type(), lens, strides};
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/pad.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/pad.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_pad::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
check_shapes{inputs, *this}.has(1).standard();
return op.compute_shape(inputs);
}
argument hip_pad::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::pad(ctx.get_stream().get(), args.back(), args.front(), op.value, op.pads);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -28,7 +28,10 @@
#include <migraphx/register_op.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/dead_code_elimination.hpp>
#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
#include <migraphx/gpu/ck.hpp>
#endif
#include <migraphx/gpu/fuse_mlir.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -128,26 +131,49 @@ struct pre_gemm_softmax_gemm : gemm_softmax_gemm
};
MIGRAPHX_REGISTER_OP(pre_gemm_softmax_gemm);
-MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
+auto is_ck_gemm()
{
-if(ins->name() != "dot")
-return false;
-if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type()))
-return false;
-return true;
+return match::make_basic_pred_matcher([=](instruction_ref ins) {
+#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
+if(not enabled(MIGRAPHX_ENABLE_CK{}))
+return false;
+if(ins->name() != "dot")
+return false;
+if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type()))
+return false;
+return true;
+#else
+(void)ins;
+return false;
+#endif
+});
}

+auto is_mlir_gemm()
+{
+return match::make_basic_pred_matcher([=](instruction_ref ins) {
+if(not mlir_attention_enabled())
+return false;
+if(ins->name() != "dot")
+return false;
+return std::all_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
+return pre_gemm_softmax_gemm::is_mlir_supported_type(i->get_shape().type());
+});
+});
+}
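Both predicates above go through `match::make_basic_pred_matcher`, which lifts a plain `instruction_ref` predicate into a matcher so it composes with combinators such as `match::any_of` in `find_gemm_softmax_gemm` below. A sketch of the same pattern with an invented predicate (not from this diff):

// Hypothetical: matches "dot" instructions with small outputs.
auto is_small_dot()
{
    return match::make_basic_pred_matcher([](instruction_ref ins) {
        return ins->name() == "dot" and ins->get_shape().elements() < 1024;
    });
}
// Composes like any other matcher, e.g.:
//   match::name("dot")(match::any_of(is_small_dot(), is_ck_gemm()))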
struct find_gemm_softmax_gemm
{
auto matcher() const
{
-auto gemm1 =
-match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1")));
+auto gemm1 = match::skip(match::name("contiguous"))(
+match::name("dot")(match::any_of(is_ck_gemm(), is_mlir_gemm()).bind("gemm1")));
auto mul = match::name("mul")(
match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1));
auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax");
return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax));
return match::name("dot")(match::any_of(is_ck_gemm(), is_mlir_gemm()).bind("gemm2"))(
match::arg(0)(softmax));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
......@@ -182,8 +208,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
match::find_matches(mpm.get_module(), find_layernorm{});
mpm.run_pass(dead_code_elimination{});
match::find_matches(mpm.get_module(), find_add_layernorm{});
-if(enabled(MIGRAPHX_ENABLE_CK{}))
-match::find_matches(mpm, find_gemm_softmax_gemm{});
+match::find_matches(mpm, find_gemm_softmax_gemm{});
}
} // namespace gpu
......
......@@ -53,19 +53,16 @@ bool get_compute_fp32_flag()
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
-bool get_int8_x4_format(context& ctx)
+bool rocblas_fp8_available()
{
-#if ROCBLAS_VERSION_MAJOR >= 3
-(void)(ctx);
+#ifndef MIGRAPHX_USE_ROCBLAS_FP8_API
return false;
#else
-// int8x4 packed format is only available starting from rocblas-v2.38 and it is deprecated in
-// v3.0 and will be removed in v4.0
-rocblas_gemm_flags flag;
-rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
-return flag == rocblas_gemm_flags_pack_int8x4;
+const auto device_name = trim(split_string(get_device_name(), ':').front());
+return (starts_with(device_name, "gfx9") and device_name >= "gfx940");
#endif
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
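The device gate in `rocblas_fp8_available` leans on lexicographic string comparison, which is sound here because the names share the `gfx9` prefix and then compare digit by digit. Checked by hand:

#include <string_view>

// "gfx90a" < "gfx940" ('0' < '4' at index 4): MI200-class excluded.
// "gfx942" >= "gfx940" ('2' > '0' at index 5): MI300-class included.
static_assert(std::string_view("gfx90a") < std::string_view("gfx940"));
static_assert(std::string_view("gfx942") >= std::string_view("gfx940"));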
......@@ -63,7 +63,6 @@
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/schedule_model.hpp>
#include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/target.hpp>
......@@ -99,12 +98,28 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
unsupported_types.erase(shape::type_t::float_type);
unsupported_types.erase(shape::type_t::fp8e4m3fnuz_type);
unsupported_types.erase(shape::type_t::half_type);
unsupported_types.erase(shape::type_t::bool_type);
unsupported_types.erase(shape::type_t::int8_type);
unsupported_types.erase(shape::type_t::uint8_type);
unsupported_types.erase(shape::type_t::int32_type);
unsupported_types.erase(shape::type_t::tuple_type);
std::set<std::string> unsupported_fp8_ops = {};
if(not gpu::rocblas_fp8_available())
{
unsupported_fp8_ops.insert("dot");
}
// add all device kernels
unsupported_fp8_ops.insert("logsoftmax");
unsupported_fp8_ops.insert("nonzero");
unsupported_fp8_ops.insert("prefix_scan_sum");
unsupported_fp8_ops.insert("scatter_none");
unsupported_fp8_ops.insert("topk");
unsupported_fp8_ops.insert("rnn_var_sl_shift_output");
unsupported_fp8_ops.insert("multinomial");
unsupported_fp8_ops.insert("argmax");
unsupported_fp8_ops.insert("argmin");
// clang-format off
return
{
......@@ -136,6 +151,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
prefuse_ops{},
dead_code_elimination{},
auto_contiguous{},
eliminate_data_type{{migraphx::shape::fp8e4m3fnuz_type}, shape::float_type, unsupported_fp8_ops},
dead_code_elimination{},
optimize_module{},
fuse_pointwise{},
dead_code_elimination{},
......@@ -154,7 +171,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
compile_miopen{&gctx},
dead_code_elimination{},
-pack_int8_args{},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
......
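A hedged reading of the fp8 wiring above, inferred from the pass arguments: `eliminate_data_type{{fp8e4m3fnuz_type}, shape::float_type, unsupported_fp8_ops}` rewrites only the listed ops so they compute in float, letting fp8 models compile while device kernels are filled in incrementally; when `rocblas_fp8_available()` is false, `dot` joins that fallback list too. Illustrative rewrite for one listed op (pseudo-IR, not literal MIGraphX output):

// y = logsoftmax(x : fp8e4m3fnuz)
// becomes
// x' = convert[float](x)
// y' = logsoftmax(x' : float)
// y  = convert[fp8e4m3fnuz](y')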
......@@ -33,8 +33,9 @@ rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
rocm_clang_tidy_check(migraphx_ref)
target_link_libraries(migraphx_ref PRIVATE Threads::Threads)
+target_link_libraries(migraphx_ref PUBLIC migraphx)
-target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
+target_include_directories(migraphx_ref SYSTEM PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
migraphx_generate_export_header(migraphx_ref)
......
......@@ -38,7 +38,11 @@ protobuf_generate_cpp(
)
add_library(tf-proto STATIC ${PROTO_SRCS})
target_include_directories(tf-proto SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${PROTOBUF_INCLUDE_DIR})
-target_compile_options(tf-proto PRIVATE -w)
+if(MSVC)
+target_compile_options(tf-proto PRIVATE /w)
+else()
+target_compile_options(tf-proto PRIVATE -w)
+endif()
target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
......@@ -49,7 +53,10 @@ target_include_directories(migraphx_tf PRIVATE include)
set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
rocm_set_soversion(migraphx_tf ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_tf)
target_link_libraries(migraphx_tf PRIVATE tf-proto "-Wl,--exclude-libs,ALL")
target_link_libraries(migraphx_tf PRIVATE tf-proto)
if(NOT WIN32)
target_link_libraries(migraphx_tf PRIVATE "-Wl,--exclude-libs,ALL")
endif()
target_link_libraries(migraphx_tf PUBLIC migraphx)
rocm_install_targets(
......
......@@ -31,8 +31,18 @@
#include <sstream>
#include <iostream>
#include <string>
-#include <sys/types.h>
+#ifdef _WIN32
+// cppcheck-suppress definePrefix
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#undef getpid
+// cppcheck-suppress [definePrefix, defineUpperCase]
+#define getpid _getpid
+#else
#include <unistd.h>
+#include <sys/types.h>
+#endif
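With the shim above, POSIX-style call sites keep compiling on Windows, where the CRT spells the function `_getpid`. A hedged example of such a call site (assumes the includes and macro above are in effect):

#include <cstdio>

void log_pid()
{
    // getpid resolves to _getpid on Windows and to the POSIX call elsewhere.
    std::printf("pid=%d\n", static_cast<int>(getpid()));
}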
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -88,7 +88,6 @@ bool verify_args(const std::string& name,
if(target_nan_idx >= 0)
std::cout << "Non finite number found in target at " << target_nan_idx << ": "
<< target[target_nan_idx] << std::endl;
std::cout << "MIGraphX verification passed successfully." << std::endl;
}
});
return passed;
......
......@@ -25,5 +25,5 @@
#define MIGRAPHX_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define MIGRAPHX_VERSION_MINOR @PROJECT_VERSION_MINOR@
#define MIGRAPHX_VERSION_PATCH @PROJECT_VERSION_PATCH@
-#define MIGRAPHX_VERSION_TWEAK @PROJECT_VERSION_TWEAK@
+#define MIGRAPHX_VERSION_TWEAK "@PROJECT_VERSION_TWEAK@"
// clang-format on
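Quoting `@PROJECT_VERSION_TWEAK@` turns the macro into a string literal, which stays well-formed even when the tweak field is empty or non-numeric. A hedged illustration with hypothetical expansions:

// Empty tweak, old form: the macro expands to nothing, so any use site
// that expects a value breaks.
#define VERSION_TWEAK_OLD
// Empty or non-numeric tweak, new form: still a usable string.
#define VERSION_TWEAK_NEW ""
// std::printf("%s", VERSION_TWEAK_NEW);  // fine
// std::printf("%s", VERSION_TWEAK_OLD);  // would not compile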
......@@ -25,7 +25,7 @@
cmake_policy(SET CMP0057 NEW)
find_package(Threads REQUIRED)
-rocm_test_link_libraries(Threads::Threads migraphx migraphx_ref migraphx_onnx migraphx_tf)
+rocm_test_link_libraries(Threads::Threads migraphx migraphx_onnx migraphx_tf)
rocm_test_include_directories(include)
set(MIGRAPHX_DISABLE_LARGE_BUFFER_TESTS Off CACHE BOOL "")
......@@ -146,7 +146,11 @@ endfunction()
function(test_headers PREFIX)
file(GLOB HEADERS CONFIGURE_DEPENDS ${ARGN})
if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
list(REMOVE_ITEM HEADERS
${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/ck.hpp)
endif()
list(REMOVE_ITEM HEADERS ${CMAKE_SOURCE_DIR}/src/include/migraphx/float8_impl.hpp)
foreach(HEADER ${HEADERS})
file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
......
......@@ -30,6 +30,9 @@ function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR})
add_dependencies(tests ${NAME})
add_dependencies(check ${NAME})
if(WIN32)
target_compile_definitions(${NAME} PRIVATE _CRT_SECURE_NO_WARNINGS)
endif()
endfunction()
# Workaround: C files don't work with clang-tidy right now; needs a fix in rocm-cmake
......@@ -41,6 +44,9 @@ function(add_c_api_test TEST_NAME TEST_SRC TEST_DIR)
add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR})
add_dependencies(tests ${NAME})
add_dependencies(check ${NAME})
if(WIN32)
target_compile_definitions(${NAME} PRIVATE _CRT_SECURE_NO_WARNINGS)
endif()
endfunction()
add_api_test(array_base test_array_base.cpp ${TEST_ONNX_DIR})
......@@ -57,10 +63,6 @@ add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
add_api_test(tf_parser test_tf_parser.cpp ${TEST_TF_DIR})
# GPU-based tests
if(MIGRAPHX_ENABLE_GPU)
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-find_package(hip)
add_api_test(gpu test_gpu.cpp ${TEST_ONNX_DIR})
target_link_libraries(test_api_gpu)
add_api_test(custom_op_gpu test_custom_op_gpu.cpp ${TEST_ONNX_DIR})
target_link_libraries(test_api_custom_op_gpu)
endif()
......@@ -198,4 +198,29 @@ TEST_CASE(set_loop_default_iter_num)
EXPECT(out_shapes[1].lengths() == out_lens1);
}
TEST_CASE(set_loop_limit_iterations)
{
migraphx::onnx_options option;
option.set_default_loop_iterations(15);
option.set_limit_loop_iterations(10);
auto p = migraphx::parse_onnx("loop_default_test.onnx", option);
auto out_shapes = p.get_output_shapes();
std::vector<std::size_t> out_lens0 = {1};
EXPECT(out_shapes[0].lengths() == out_lens0);
std::vector<std::size_t> out_lens1 = {10, 1};
EXPECT(out_shapes[1].lengths() == out_lens1);
}
TEST_CASE(set_loop_limit_iterations2)
{
migraphx::onnx_options option;
option.set_limit_loop_iterations(10);
auto p = migraphx::parse_onnx("loop_test_implicit_tripcnt.onnx", option);
auto out_shapes = p.get_output_shapes();
std::vector<std::size_t> out_lens0 = {1};
EXPECT(out_shapes[0].lengths() == out_lens0);
std::vector<std::size_t> out_lens1 = {10, 1};
EXPECT(out_shapes[1].lengths() == out_lens1);
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
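The two tests above pin down how the new `set_limit_loop_iterations` interacts with trip counts: whether the count comes from `set_default_loop_iterations` (15 here) or is implicit in the model, the unrolled iteration dimension of the scan output is capped at the limit (10). Usage in one place, taken from the tests:

migraphx::onnx_options option;
option.set_default_loop_iterations(15); // used when the model omits max_trip_count
option.set_limit_loop_iterations(10);   // upper bound on unrolled iterations
auto p = migraphx::parse_onnx("loop_default_test.onnx", option);
// p.get_output_shapes()[1].lengths() == {10, 1}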
......@@ -317,4 +317,59 @@ TEST_CASE(loop_test)
}
}
TEST_CASE(loop_test_limit_max_iter)
{
auto run_prog = [&](int64_t limit_max_iterations) {
migraphx::onnx_options parse_options;
parse_options.set_limit_loop_iterations(limit_max_iterations);
auto p = migraphx::parse_onnx("loop_test_implicit_tripcnt.onnx", parse_options);
auto shapes_before = p.get_output_shapes();
migraphx::compile_options options;
options.set_offload_copy();
p.compile(migraphx::target("gpu"), options);
auto shapes_after = p.get_output_shapes();
CHECK(shapes_before.size() == 2);
CHECK(bool{shapes_before.front() == shapes_after.front()});
migraphx::program_parameters pp;
auto param_shapes = p.get_parameter_shapes();
auto aas = param_shapes["a"];
std::vector<float> xd = {1.0f};
pp.add("a", migraphx::argument(aas, xd.data()));
auto bbs = param_shapes["b"];
std::vector<float> yd = {2.0};
pp.add("b", migraphx::argument(bbs, yd.data()));
auto cs = param_shapes["keep_going_cond"];
bool cond = true;
pp.add("keep_going_cond", migraphx::argument(cs, &cond));
auto outputs = p.eval(pp);
auto output = outputs[0];
std::vector<std::vector<float>> ret;
ret.push_back(output.as_vector<float>());
output = outputs[1];
ret.push_back(output.as_vector<float>());
return ret;
};
{
auto result_vector = run_prog(5);
std::vector<float> gold0 = {2.0f};
EXPECT(result_vector.at(0) == gold0);
std::vector<float> gold1 = {-2, 4, 0, 0, 0};
EXPECT(result_vector.at(1) == gold1);
}
{
auto result_vector = run_prog(20);
std::vector<float> gold0 = {2.0f};
EXPECT(result_vector.at(0) == gold0);
std::vector<float> gold1 = {-2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
EXPECT(result_vector.at(1) == gold1);
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/float_equal.hpp>
#include <migraphx/float8.hpp>
#include <migraphx/half.hpp>
#include "test.hpp"
......@@ -53,7 +54,7 @@ auto test_float_equal(T x, U y)
template <class T, class U>
void test_equality()
{
-auto x1 = T(0.1);
+auto x1 = T(0.125);
auto x2 = U(0.0);
auto x3 = U(1.0);
EXPECT(test_float_equal(x1, x1));
......@@ -71,8 +72,12 @@ void test_equality()
TEST_CASE_REGISTER(test_equality<double, float>);
TEST_CASE_REGISTER(test_equality<double, int>);
TEST_CASE_REGISTER(test_equality<double, migraphx::half>);
TEST_CASE_REGISTER(test_equality<double, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_equality<float, int>);
TEST_CASE_REGISTER(test_equality<float, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_equality<migraphx::half, int>);
TEST_CASE_REGISTER(test_equality<migraphx::half, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_equality<migraphx::fp8::fp8e4m3fnuz, int>);
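The change from `T(0.1)` to `T(0.125)` above matters once fp8 joins the matrix: 0.125 is 2^-3, exactly representable at every binary float width including fp8e4m3fnuz, so cross-type equality comparisons are meaningful; 0.1 has no exact binary representation and rounds to a different value at each width. Quick check:

static_assert(0.125 == 0.125f);                  // exact at every width
static_assert(static_cast<double>(0.1f) != 0.1); // 0.1 rounds differently per width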
template <class T, class U>
void test_limits()
......@@ -110,8 +115,13 @@ void test_limits()
TEST_CASE_REGISTER(test_limits<double, float>);
TEST_CASE_REGISTER(test_limits<double, int>);
TEST_CASE_REGISTER(test_limits<double, migraphx::half>);
TEST_CASE_REGISTER(test_limits<double, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_limits<float, int>);
TEST_CASE_REGISTER(test_limits<float, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_limits<int, migraphx::half>);
TEST_CASE_REGISTER(test_limits<int, migraphx::fp8::fp8e4m3fnuz>);
TEST_CASE_REGISTER(test_limits<migraphx::fp8::fp8e4m3fnuz, migraphx::half>);
#ifndef _WIN32
// On Windows, types int and long have the same min and max values.
TEST_CASE_REGISTER(test_limits<long, int>);
......