gaoqiong / MIGraphX / Commits

Commit 2ab23275
Authored Jun 16, 2023 by Alan Turner

    Merge remote-tracking branch 'origin/develop' into ck-integration-tuning

Parents: 2c3563dd, 013d4829

Showing 20 changed files with 442 additions and 102 deletions (+442 -102):
    cmake/Embed.cmake                                            +76  -15
    src/common.cpp                                               +17   -2
    src/driver/main.cpp                                          +21   -0
    src/driver/perf.cpp                                          +34   -0
    src/driver/perf.hpp                                           +9   -0
    src/include/migraphx/matcher.hpp                              +9   -9
    src/include/migraphx/op/broadcast.hpp                         +3   -0
    src/include/migraphx/op/multibroadcast.hpp                    +8   -3
    src/include/migraphx/op/reshape.hpp                         +117   -5
    src/include/migraphx/shape.hpp                                +1   -1
    src/onnx/onnx_parser.cpp                                     +19   -0
    src/onnx/parse_instancenorm.cpp                              +28  -18
    src/program.cpp                                              +47  -33
    src/targets/gpu/CMakeLists.txt                                +5   -6
    src/targets/gpu/hip.cpp                                       +5   -1
    src/targets/gpu/jit/ck_gemm.cpp                               +1   -1
    src/targets/gpu/kernels/include/migraphx/kernels/math.hpp     +2   -4
    src/value.cpp                                                 +3   -0
    test/api/test_gpu.cpp                                         +4   -4
    test/onnx/gen_onnx.py                                        +33   -0
cmake/Embed.cmake (+76 -15)

@@ -24,6 +24,40 @@
 find_program(EMBED_LD ld)
 find_program(EMBED_OBJCOPY objcopy)

+if(LINUX)
+    option(EMBED_USE_LD "Use ld to embed data files" ON)
+else()
+    option(EMBED_USE_LD "Use ld to embed data files" OFF)
+endif()
+
+function(wrap_string)
+    set(options)
+    set(oneValueArgs VARIABLE AT_COLUMN)
+    set(multiValueArgs)
+
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    string(LENGTH ${${PARSE_VARIABLE}} string_length)
+    math(EXPR offset "0")
+
+    while(string_length GREATER 0)
+        if(string_length GREATER ${PARSE_AT_COLUMN})
+            math(EXPR length "${PARSE_AT_COLUMN}")
+        else()
+            math(EXPR length "${string_length}")
+        endif()
+
+        string(SUBSTRING ${${PARSE_VARIABLE}} ${offset} ${length} line)
+        set(lines "${lines}\n${line}")
+
+        math(EXPR string_length "${string_length} - ${length}")
+        math(EXPR offset "${offset} + ${length}")
+    endwhile()
+
+    set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
+endfunction()
+
 function(generate_embed_source EMBED_NAME)
     set(options)
     set(oneValueArgs SRC HEADER)

@@ -46,14 +80,21 @@ function(generate_embed_source EMBED_NAME)
         list(GET PARSE_OBJECTS ${idx} OBJECT)
         set(START_SYMBOL "_binary_${SYMBOL}_start")
         set(END_SYMBOL "_binary_${SYMBOL}_end")
+        if(EMBED_USE_LD)
             string(APPEND EXTERNS "
                 extern const char ${START_SYMBOL}[];
                 extern const char ${END_SYMBOL}[];
             ")
+        else()
+            string(APPEND EXTERNS "
+                extern const char ${START_SYMBOL}[];
+                extern const char* ${END_SYMBOL};
+            ")
+        endif()
         # TODO: Should use NAME_WLE
         get_filename_component(BASE_NAME "${OBJECT}" NAME)
-        string(REGEX REPLACE ".[A-Za-z0-9_]$" "" BASE_NAME ${BASE_NAME})
+        string(REGEX REPLACE ".[A-Za-z0-9_]+$" "" BASE_NAME ${BASE_NAME})
         string(APPEND INIT_KERNELS "
             { \"${BASE_NAME}\", { ${START_SYMBOL}, ${END_SYMBOL}} },
         ")

@@ -86,9 +127,14 @@ function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
     string(MAKE_C_IDENTIFIER "${REL_FILE}" SYMBOL)
     get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
     file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
-    set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
+    if(EMBED_USE_LD)
+        set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
+    else()
+        set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
+    endif()
     set(${OUTPUT_SYMBOL} ${SYMBOL} PARENT_SCOPE)
     set(${OUTPUT_FILE} "${OUT_FILE}" PARENT_SCOPE)
+    if(EMBED_USE_LD)
         add_custom_command(
             OUTPUT "${OUT_FILE}"
             COMMAND ${EMBED_LD} -r -o "${OUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"

@@ -97,6 +143,21 @@ function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
             DEPENDS ${FILE}
             VERBATIM)
+    else()
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
+        # reads source file contents as hex string
+        file(READ ${FILE} HEX_STRING HEX)
+        # wraps the hex string into multiple lines
+        wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
+        # adds '0x' prefix and comma suffix before and after every byte respectively
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
+        # removes trailing comma
+        string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
+        file(WRITE "${OUT_FILE}" "
+extern const char _binary_${SYMBOL}_start[] = { ${ARRAY_VALUES} };
+extern const char* _binary_${SYMBOL}_end = _binary_${SYMBOL}_start + sizeof(_binary_${SYMBOL}_start);\n")
+    endif()
     endforeach()
 endfunction()

@@ -119,6 +180,6 @@ function(add_embed_library EMBED_NAME)
     generate_embed_source(${EMBED_NAME} SRC ${SRC_FILE} HEADER ${HEADER_FILE} OBJECTS ${OUTPUT_FILES} SYMBOLS ${SYMBOLS})
     add_library(${EMBED_NAME} STATIC ${OUTPUT_FILES} "${SRC_FILE}")
     target_include_directories(${EMBED_NAME} PUBLIC "${EMBED_DIR}/include")
-    target_compile_options(${EMBED_NAME} PRIVATE -Wno-reserved-identifier)
+    target_compile_options(${EMBED_NAME} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
     set_target_properties(${EMBED_NAME} PROPERTIES POSITION_INDEPENDENT_CODE On)
 endfunction()
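For a concrete picture of the new EMBED_USE_LD=OFF path, here is roughly what the file(WRITE ...) template above generates. This is an illustrative sketch, not part of the commit; the symbol name "kernels_add_cpp" and the three-byte payload are hypothetical:

    // Illustrative sketch of a generated embed source file, assuming the
    // embedded file's MAKE_C_IDENTIFIER symbol is "kernels_add_cpp"
    // (hypothetical) and its contents are the bytes "abc" (0x61 0x62 0x63).
    extern const char _binary_kernels_add_cpp_start[] = {0x61, 0x62, 0x63};
    extern const char* _binary_kernels_add_cpp_end =
        _binary_kernels_add_cpp_start + sizeof(_binary_kernels_add_cpp_start);

Note that the non-ld path makes the `_end` symbol a pointer initialized to one past the end of the array, while the ld path's `_end` is itself an array symbol; the if(EMBED_USE_LD) branch in generate_embed_source declares the externs accordingly.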
src/common.cpp (+17 -2)

 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

@@ -148,6 +148,18 @@ shape common_shape(const std::vector<shape>& shapes)
     return {compute_common_types(shapes), compute_common_lens(shapes)};
 }

+/**
+ * @brief Creates and adds instructions to convert input arguments to common shapes and types
+ * by adding multi-broadcast and type convert operations. This is a utility function for creating
+ * operations where the shape and type of inputs need to match. It supports both dynamic and
+ * static-shaped arguments.
+ *
+ * @param m containing module for instruction
+ * @param ins insertion location in instruction list
+ * @param inputs instructions to use as argument list; also, the shapes
+ * attached to each instruction_ref are considered for broadcasting
+ * @return std::vector<instruction_ref> a modified argument list
+ */
 std::vector<instruction_ref>
 insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref> inputs)
 {

@@ -158,7 +170,7 @@ insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref>
     if(inputs.size() != 2)
     {
         MIGRAPHX_THROW("INSERT_COMMON_OP: not handled; " + migraphx::to_string(inputs.size()) +
-                       "inputs, only handle two inputs if any are dynamic shape");
+                       " inputs. Requires exactly two inputs if any are dynamic shape");
     }
     auto c_type = compute_common_types(to_shapes(inputs));

@@ -224,6 +236,9 @@ instruction_ref insert_common_op(module& m,
     return m.insert_instruction(ins, op, insert_common_args(m, ins, std::move(inputs)));
 }

+/**
+ * Wrapper for insert_common_args() which inserts operation at the end of the module.
+ */
 instruction_ref
 add_common_op(module& m, const operation& op, std::vector<instruction_ref> inputs)
 {
     return insert_common_op(m, m.end(), op, std::move(inputs));
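A minimal usage sketch of the documented utility, using the add_common_op signature from this diff; `m`, `x`, and `y` are assumed to be an existing module and two instructions with broadcastable shapes:

    // Sketch: append an "add" whose inputs are first brought to a common
    // shape and type; multibroadcast/convert instructions are inserted
    // automatically as needed, for static or dynamic input shapes.
    auto sum = migraphx::add_common_op(m, migraphx::make_op("add"), {x, y});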
src/driver/main.cpp (+21 -0)

@@ -455,8 +455,29 @@ struct compiler
     {
         auto p = l.load();
         // Dont compile if its already been compiled
         if(p.is_compiled())
         {
+            if(ct.target_name == "gpu")
+            {
+                if(is_offload_copy_set(p) and not co.offload_copy)
+                {
+                    std::cout << "MIGraphX program was likely compiled with offload_copy set, Try "
+                                 "passing "
+                                 "`--enable-offload-copy` if program run fails.\n";
+                }
+                else if(co.offload_copy)
+                {
+                    std::cout << "MIGraphX program was likely compiled without "
+                                 "offload_copy set, Try "
+                                 "removing "
+                                 "`--enable-offload-copy` flag if passed to driver, if program run "
+                                 "fails.\n";
+                }
+            }
             return p;
         }
         auto t = ct.get_target();
         if(to_fp16)
         {
src/driver/perf.cpp (+34 -0)

@@ -24,6 +24,8 @@
 #include "perf.hpp"
 #include <migraphx/generate.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/instruction_ref.hpp>
 #include <migraphx/register_target.hpp>
 #ifdef HAVE_GPU
 #include <migraphx/gpu/hip.hpp>

@@ -97,6 +99,38 @@ target get_target(bool gpu)
     return make_target("cpu");
 }

+bool is_offload_copy_set(const program& p)
+{
+    assert(p.is_compiled());
+    const module* mm                     = p.get_main_module();
+    std::vector<std::string> param_names = mm->get_parameter_names();
+    std::unordered_set<instruction_ref> param_ins;
+    std::transform(param_names.begin(),
+                   param_names.end(),
+                   std::inserter(param_ins, param_ins.begin()),
+                   [&](const auto& i) { return mm->get_parameter(i); });
+    for(const auto& i : *mm)
+    {
+        if(i.name() == "hip::copy_to_gpu")
+        {
+            auto copy_arg = instruction::get_output_alias(i.inputs().front(), true);
+            param_ins.erase(copy_arg);
+        }
+        else if(i.name() == "@return")
+        {
+            auto return_args = i.inputs();
+            for(const auto& j : return_args)
+            {
+                auto alias_ins = instruction::get_output_alias(j, true);
+                if((alias_ins->name() == "@param" && param_ins.erase(alias_ins) == 0) or
+                   (alias_ins->name() != "hip::copy_from_gpu"))
+                    return false;
+            }
+        }
+    }
+    return param_ins.empty();
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver
 } // namespace migraphx
src/driver/perf.hpp (+9 -0)

@@ -39,6 +39,15 @@ parameter_map create_param_map(const program& p, const target& t, bool offload =
 parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu);
 parameter_map create_param_map(const program& p, bool gpu = true);
 target get_target(bool gpu);

+/**
+ * @brief Checks if MIGraphX program compiled for "GPU" has offload_copy set or not. This is
+ * intended to print a HINT for the users and would not always correctly classify compiled
+ * program as with or without offload_copy in all cases.
+ * @param p Compiled MIGraphX program for GPU backend
+ * @return true if program is classified as compiled with "offload_copy" set
+ */
+bool is_offload_copy_set(const program& p);
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver
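A hedged sketch of how this heuristic could be consulted outside the driver; it assumes the usual migraphx deserialization entry point and a hypothetical model path:

    // Sketch: load a previously compiled program and apply the heuristic.
    // "model.mxr" is a hypothetical file; migraphx::load is assumed to be
    // the standard program-deserialization function.
    migraphx::program p = migraphx::load("model.mxr");
    if(p.is_compiled() and migraphx::driver::is_offload_copy_set(p))
        std::cout << "program appears to be compiled with offload_copy set\n";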
src/include/migraphx/matcher.hpp (+9 -9)

@@ -135,14 +135,14 @@ template <class M>
 auto bind_match(M m, std::string name)
 {
     return make_function_matcher(
-        [=, name = std::move(name)](matcher_context& ctx,
+        [=, m_name = std::move(name)](matcher_context& ctx,
                                     instruction_ref ins) -> optional<instruction_ref> {
             auto result = m.match(ctx, ins);
             if(result)
             {
                 if(not ctx.has_instruction(ins))
                     return nullopt;
-                ctx.instructions[name] = ins;
+                ctx.instructions[m_name] = ins;
             }
             return result;
         });

@@ -655,9 +655,9 @@ auto skip_output(Ms... ms)
 inline auto var(std::string s)
 {
     return make_basic_fun_matcher(
-        [=, s = std::move(s)](const matcher_context& ctx,
+        [=, m_s = std::move(s)](const matcher_context& ctx,
                               instruction_ref) -> optional<instruction_ref> {
-            auto it = ctx.instructions.find(s);
+            auto it = ctx.instructions.find(m_s);
             if(it == ctx.instructions.end())
                 return nullopt;
             return it->second;

@@ -667,7 +667,7 @@ inline auto var(std::string s)
 inline auto name(std::string s)
 {
     return make_basic_pred_matcher(
-        [=, s = std::move(s)](instruction_ref ins) { return ins->name() == s; });
+        [=, m_s = std::move(s)](instruction_ref ins) { return ins->name() == m_s; });
 }

 inline auto name_contains(const std::string& name)

@@ -678,8 +678,8 @@ inline auto name_contains(const std::string& name)
 inline auto name(std::unordered_set<std::string> names)
 {
-    return make_basic_pred_matcher([=, names = std::move(names)](instruction_ref ins) {
-        return names.count(ins->name()) > 0;
+    return make_basic_pred_matcher([=, m_names = std::move(names)](instruction_ref ins) {
+        return m_names.count(ins->name()) > 0;
     });
 }
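The changes here only rename the lambda captures (to an m_ prefix, presumably to silence shadowing warnings); behavior is unchanged. For context, a sketch of how these matchers are typically composed, assuming the matcher .bind() helper built on the bind_match shown above:

    // Sketch: match any instruction named "add" and bind it under "sum".
    // bind_match stores the instruction in matcher_context::instructions
    // under that key, and var("sum") can retrieve it in a later match.
    auto m = migraphx::match::name("add").bind("sum");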
src/include/migraphx/op/broadcast.hpp (+3 -0)

@@ -68,6 +68,9 @@ struct broadcast
     {
         // the ONNX broadcast op is deprecated now, so not handling the negative
         // value of axis anymore
+        if(s0.dynamic())
+            MIGRAPHX_THROW("BROADCAST: Single dynamic input shape not supported. Use two inputs.");
+
         if(axis >= broadcast_lens.size())
         {
             MIGRAPHX_THROW("BROADCAST : axis " + migraphx::to_string(axis) +
src/include/migraphx/op/multibroadcast.hpp (+8 -3)

 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

@@ -37,8 +37,10 @@ namespace op {
 /**
  * Broadcast multiple dimensions between two tensors.
  * Two versions of this operator: one input and two inputs.
- * One input version uses output_lens attribute and broadcasts to it.
- * Two inputs version broadcasts both inputs to the common shape at evaluation time.
+ * One input version uses output_lens attribute and broadcasts to it (does not support
+ * dynamic shape input).
+ *
+ * Two inputs version broadcasts the first input to the common shape of the two inputs.
  */
 struct multibroadcast
 {

@@ -81,6 +83,9 @@ struct multibroadcast
         if(inputs.size() == 1)
         {
+            if(s0.dynamic())
+                MIGRAPHX_THROW(
+                    "MULTIBROADCAST: Single dynamic input shape not supported. Use two inputs.");
             if(s0.lens().size() > output_lens.size())
             {
                 MIGRAPHX_THROW("MULTIBROADCAST: input dimensions should <= output size");
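A short sketch of the two forms described in the updated comment, assuming a module `m` and existing instructions `x` and `y`; the out_lens values are illustrative:

    // Two-input form: broadcast x to the common shape of x and y
    // (this form also works when the shapes are dynamic).
    auto xb = m.add_instruction(migraphx::make_op("multibroadcast"), x, y);

    // One-input form: broadcast x to a fixed out_lens (static shapes only;
    // per this commit, a dynamic input now throws).
    auto xs = m.add_instruction(
        migraphx::make_op("multibroadcast", {{"out_lens", {2, 3, 4}}}), x);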
src/include/migraphx/op/reshape.hpp (+117 -5)

@@ -29,6 +29,7 @@
 #include <migraphx/config.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/dyn_output.hpp>
+#include <migraphx/optional.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

@@ -96,9 +97,115 @@ struct reshape
         return {s0.type(), output_dyn_dims};
     }

+    template <class Iterator>
+    static auto compute_end_dim(Iterator start, Iterator last, std::size_t dim)
+    {
+        std::size_t x = 1;
+        auto it       = std::find_if(start, last, [&](auto i) {
+            x *= i;
+            return x >= dim;
+        });
+        if(x != dim)
+            return start;
+        return it;
+    }
+
+    template <class DimIterator, class StrideIterator>
+    static auto can_strides_merge(DimIterator dim_start,
+                                  DimIterator dim_last,
+                                  StrideIterator stride_start,
+                                  StrideIterator stride_last)
+    {
+        assert(std::distance(dim_start, dim_last) == std::distance(stride_start, stride_last));
+        auto cstride = *std::prev(stride_last);
+        return std::equal(std::make_reverse_iterator(dim_last),
+                          std::make_reverse_iterator(dim_start + 1),
+                          std::make_reverse_iterator(stride_last - 1),
+                          std::make_reverse_iterator(stride_start),
+                          [&](auto dim, auto stride) {
+                              cstride *= dim;
+                              return stride == cstride;
+                          });
+    }
+
+    // This will reshape the dimesions of the input shape to use the lens of
+    // `rdims`. If this can't be done without changing memory layout then it
+    // will return nullopt
+    static optional<shape> reshape_dims(const shape& input, const std::vector<std::size_t>& rdims)
+    {
+        if(input.standard())
+            return shape{input.type(), rdims};
+
+        const auto& idims    = input.lens();
+        const auto& istrides = input.strides();
+
+        std::vector<std::size_t> rstrides;
+        std::size_t i = 0;
+        std::size_t r = 0;
+        while(i < idims.size() and r < rdims.size())
+        {
+            auto idim = idims[i];
+            auto rdim = rdims[r];
+            if(rdim == idim)
+            {
+                rstrides.push_back(istrides[i]);
+            }
+            // squeeze
+            else if(rdim > idim)
+            {
+                auto start = idims.begin() + i;
+                auto it    = compute_end_dim(start, idims.end(), rdim);
+                if(it == start)
+                    return nullopt;
+                auto n = it - start;
+                assert((i + n) <= istrides.size());
+                if(not can_strides_merge(
+                       start, it + 1, istrides.begin() + i, istrides.begin() + i + n + 1))
+                    return nullopt;
+                i += n;
+                rstrides.push_back(istrides[i]);
+            }
+            // unsqueeze
+            else // if(rdim < idim)
+            {
+                auto start = rdims.begin() + i;
+                auto it    = compute_end_dim(start, rdims.end(), idim);
+                if(it == start)
+                    return nullopt;
+                auto n = it - start;
+                assert((r + n) <= rdims.size());
+                auto stride = istrides[i] * idim;
+                std::for_each(start, it + 1, [&](auto dim) {
+                    stride /= dim;
+                    rstrides.push_back(stride);
+                });
+                r += n;
+            }
+            i++;
+            r++;
+        }
+
+        // Handle trailing 1s
+        if(rstrides.size() < rdims.size() and not rstrides.empty())
+        {
+            auto stride = rstrides.back();
+            for(auto d : range(rdims.begin() + rstrides.size(), rdims.end()))
+            {
+                if(d != 1)
+                    return nullopt;
+                rstrides.push_back(stride);
+            }
+        }
+
+        if(rdims.size() != rstrides.size())
+            return nullopt;
+
+        return shape{input.type(), rdims, rstrides};
+    }
+
     shape static_compute_shape(std::vector<shape> inputs, std::size_t n_neg_dims) const
     {
-        check_shapes{inputs, *this}.standard();
+        check_shapes{inputs, *this}.has(1);
         auto&& idims = inputs.front().lens();
         std::vector<std::size_t> rdims(dims.begin(), dims.end());

@@ -125,12 +232,17 @@ struct reshape
             }
         }

-        shape s{inputs.front().type(), rdims};
-        if(s.elements() != inputs.front().elements())
+        auto s = reshape_dims(inputs.front(), rdims);
+        if(not s.has_value())
+            MIGRAPHX_THROW("Reshape on axis that is not packed.");
+
+        if(s->elements() != inputs.front().elements())
             MIGRAPHX_THROW("Reshape: Wrong number of elements for reshape: reshape has " +
-                           std::to_string(s.elements()) + " elements whereas the input has " +
+                           std::to_string(s->elements()) + " elements whereas the input has " +
                            std::to_string(inputs.front().elements()));
-        return s;
+
+        assert(s->bytes() == inputs.front().bytes());
+        return *s;
     }

     shape compute_shape(std::vector<shape> inputs) const
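An illustrative walkthrough of the new reshape_dims logic above (a sketch, assuming migraphx/op/reshape.hpp is included; not part of the commit). Take a non-standard shape broadcast along dim 0, lens {2, 3, 4} with strides {0, 4, 1}:

    // Reshaping to {2, 12} merges dims 3 and 4; their strides (4, 1) are
    // contiguous, so can_strides_merge succeeds and the result keeps the
    // broadcast stride: lens {2, 12}, strides {0, 1}.
    migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}, {0, 4, 1}};
    auto r1 = migraphx::op::reshape::reshape_dims(s, {2, 12}); // has_value()

    // Reshaping to {6, 4} would merge dims 2 and 3, but their strides (0, 4)
    // are not contiguous, so reshape_dims returns nullopt and
    // static_compute_shape throws "Reshape on axis that is not packed."
    auto r2 = migraphx::op::reshape::reshape_dims(s, {6, 4}); // nullopt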
src/include/migraphx/shape.hpp (+1 -1)

@@ -183,7 +183,7 @@ struct shape
     const std::vector<std::size_t>& strides() const;

     /*!
-     * The number of dimensions in the shape.
+     * The number of dimensions in the shape, either static or dynamic.
      * Same as the number of indices required to get a data value.
      */
     std::size_t ndim() const;
src/onnx/onnx_parser.cpp (+19 -0)

@@ -149,6 +149,25 @@ instruction_ref onnx_parser::node_info::add_broadcastable_binary_op(const std::s
     return this->add_common_op(op_name, arg0, arg1);
 }

+/**
+ * @brief A wrapper for insert_common_args(), which constructs an argument list
+ * and inserts multibroadcast and convert ops to match inputs to a common shape and type
+ * as required. The requested operation is placed after the added multibroadcast and convert ops,
+ * if any, so that their results are transparent to the programmer.
+ *
+ * Use add_common_op() to match input sizes when inputs may be
+ * either static or dynamic.
+ *
+ * @param op_name string; Name of operation (op) to add; valid names are the same as
+ * for make_op()
+ *
+ * @param inputs vector of instruction_ref. List of instructions for the new
+ * operator. Multibroadcast and convert operations, if needed, are deduced from these too.
+ *
+ * @return instruction_ref Returns an instruction_ref which is the result of the requested
+ * operation.
+ */
 instruction_ref onnx_parser::node_info::add_common_op(const std::string& op_name,
                                                       std::vector<instruction_ref> inputs) const
 {
src/onnx/parse_instancenorm.cpp (+28 -18)

 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

@@ -84,16 +84,17 @@ struct parse_instancenorm : op_parser<parse_instancenorm>
             MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                            ". Valid types are 1 (float), 10 (half), and 11 (double).");

-        auto ndims = dims.size();
+        bool dyn_input = x->get_shape().dynamic();
+        auto ndims     = x->get_shape().ndim();
         assert(ndims >= 2);
         auto kdims = ndims - 2;

         std::vector<int64_t> axes(kdims);
         std::iota(axes.begin(), axes.end(), 2);

         auto mean = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x);
-        auto mean_bcast =
-            info.add_instruction(make_op("multibroadcast", {{"out_lens", dims}}), mean);
-        auto l1 = info.add_instruction(make_op("sub"), x, mean_bcast);
+        // Use add_common_op() to insert multibroadcast/convert instructions where needed when
+        // inputs may be either static or dynamic.
+        auto l1 = info.add_common_op("sub", x, mean);
         // for the fp16, if not converting to fp32 then divide `x` and `mean` by `sqrt(n)` and take
         // reduce_sum to calculate variance i.e.
         // var = reduce_sum((x/s_n - mean/s_n)^2) where s_n = sqrt(n)

@@ -107,23 +108,32 @@ struct parse_instancenorm : op_parser<parse_instancenorm>
             });
             n = 1.0 / std::sqrt(n);
             auto n_literal = info.add_literal(literal{dtype, {n}});
             mean_bcast     = info.add_common_op("mul", {mean_bcast, n_literal});
             x              = info.add_common_op("mul", {x, n_literal});
         }
-        auto l0       = info.add_instruction(make_op("sqdiff"), x, mean_bcast);
+        auto l0       = info.add_common_op("sqdiff", x, mean);
         auto variance = info.add_instruction(make_op(reduce_op_name, {{"axes", axes}}), l0);
         auto epsilon_literal = info.add_literal(literal{shape{literal_dtype}, {epsilon}});
-        auto epsilon_bcast =
-            info.add_instruction(make_op("multibroadcast", {{"out_lens", dims}}), epsilon_literal);
-        auto variance_bcast =
-            info.add_instruction(make_op("multibroadcast", {{"out_lens", dims}}), variance);
-        auto l2 = info.add_instruction(make_op("add"), variance_bcast, epsilon_bcast);
+        auto l2 = info.add_common_op("add", variance, epsilon_literal);
         auto l3 = info.add_instruction(make_op("rsqrt"), l2);
-        auto l4 = info.add_instruction(make_op("mul"), l1, l3);
-        auto scale_bcast =
-            info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), scale);
-        auto bias_bcast =
+        auto l4 = info.add_common_op("mul", l1, l3);
+        // add_common_op() doesn't apply the plain broadcast op, so we add that op explicitly for
+        // both scale and bias.
+        instruction_ref scale_bcast;
+        instruction_ref bias_bcast;
+        if(dyn_input)
+        {
+            scale_bcast = info.add_instruction(make_op("broadcast", {{"axis", 1}}), scale, x);
+            bias_bcast  = info.add_instruction(make_op("broadcast", {{"axis", 1}}), bias, x);
+        }
+        else
+        {
+            scale_bcast =
+                info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), scale);
+            bias_bcast =
+                info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), bias);
+        }
         auto l5  = info.add_instruction(make_op("mul"), l4, scale_bcast);
         auto ret = info.add_instruction(make_op("add"), l5, bias_bcast);
         if(dtype == shape::half_type and convert_fp16)
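For clarity, the sqrt(n) trick referenced in the hunk's comment rests on this identity, where n is the number of elements reduced over and \mu is the mean:

    \mathrm{var}(x) \;=\; \frac{1}{n}\sum_{i}\left(x_i - \mu\right)^2
                    \;=\; \sum_{i}\left(\frac{x_i}{\sqrt{n}} - \frac{\mu}{\sqrt{n}}\right)^2

Scaling by 1/sqrt(n) before squaring keeps the summands small, which avoids fp16 overflow in the subsequent reduce_sum when the inputs are not converted to fp32.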
src/program.cpp (+47 -33)

@@ -539,7 +539,8 @@ std::vector<argument> program::eval(parameter_map params, execution_environment
         ins_out[x] = ss.str();
     });

     ret = generic_eval(*this,
                        ctx,
                        std::move(params),
                        with_check_context([&](auto& ins, auto f, auto&& check_context) {

@@ -551,15 +552,28 @@ std::vector<argument> program::eval(parameter_map params, execution_environment
             ctx.finish();
             double t2 = t.record<milliseconds>();
             std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;

             if(trace_level > 1 and ins->name().front() != '@' and ins->name() != "load" and
                not result.empty())
             {
+                migraphx::argument buffer;
+                try
+                {
                     target tgt = make_target(this->impl->target_name);
-                    auto buffer = tgt.copy_from(result);
+                    buffer     = tgt.copy_from(result);
+                }
+                catch(const migraphx::exception&)
+                {
+                    // instruction was run on host then no need to copy buffer from target
+                    buffer = result;
+                }
+                catch(...)
+                {
+                    MIGRAPHX_THROW("MIGraphX program execution with MIGRAPHX_TRACE_EVAL failed.\n");
+                }
                 if(trace_level == 2)
                 {
                     std::cout << "Output has " << to_string_range(classify_argument(buffer))
                               << std::endl;
                     std::cout << "Output: ";
                     preview_argument(std::cout, buffer);
src/targets/gpu/CMakeLists.txt (+5 -6)

@@ -35,12 +35,11 @@ endif()
 find_package(composable_kernel 1.0.0 COMPONENTS jit_library REQUIRED)

-set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
-#if(BUILD_DEV)
-#set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
-#else()
-#set(MIGRAPHX_USE_HIPRTC ON CACHE BOOL "Use hipRTC APIs")
-#endif()
+if(BUILD_DEV)
+    set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
+else()
+    set(MIGRAPHX_USE_HIPRTC ON CACHE BOOL "Use hipRTC APIs")
+endif()

 include(Embed)
 file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
src/targets/gpu/hip.cpp (+5 -1)

@@ -146,7 +146,11 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
     gpu_sync();
     std::vector<T> result(sz);
     assert(not is_device_ptr(result.data()));
-    assert(is_device_ptr(x));
+    if(not is_device_ptr(x))
+    {
+        MIGRAPHX_THROW("read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
+    }
     auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
src/targets/gpu/jit/ck_gemm.cpp (+1 -1)

@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 #include <fstream>
-#include <filesystem>
+#include <migraphx/filesystem.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp (+2 -4)

@@ -188,10 +188,8 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, max, ::max)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, min, ::min)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, max, ::max)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min)
-// Add overloads for half that calls the float version, this should use "hmax" and "hmin" once
-// perf CI docker is upgraded to rocm-5.5
-MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::fmaxf)
-MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::fminf)
+MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax)
+MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin)

 template <class T, MIGRAPHX_REQUIRES(not is_any_vec<T>())>
 constexpr auto max(const T& a, const T& b)
src/value.cpp (+3 -0)

@@ -527,6 +527,9 @@ std::size_t value_hash(const std::string& key, const T& x)
     hash_combine(h, x);
     return h;
 }

+std::size_t value_hash(const std::string& key, std::nullptr_t) { return hash_value(key); }
+
 std::size_t value_hash(const std::string& key, const std::vector<value>& x)
 {
     std::size_t h = hash_value(key);
test/api/test_gpu.cpp (+4 -4)

@@ -151,7 +151,7 @@ TEST_CASE(dynamic_batch_load_and_run_offload)
     c_options.set_offload_copy();
     p.compile(migraphx::target("gpu"), c_options);
     auto out_shapes = p.get_output_shapes();
-    CHECK(out_shapes.size() == 1);
+    EXPECT(out_shapes.size() == 1);
     EXPECT(out_shapes[0].dynamic());
     // batch size = 2

@@ -165,9 +165,9 @@ TEST_CASE(dynamic_batch_load_and_run_offload)
         migraphx::argument(migraphx::shape(migraphx_shape_float_type, {2, 3, 3, 3}), c.data()));
     auto outputs = p.eval(pp);
-    CHECK(shapes_before.size() == outputs.size());
-    CHECK(bool{outputs.front().get_shape() ==
-               migraphx::shape(migraphx_shape_float_type, {2, 1, 3, 3})});
+    EXPECT(shapes_before.size() == outputs.size());
+    EXPECT(bool{outputs.front().get_shape() ==
+                migraphx::shape(migraphx_shape_float_type, {2, 2, 2, 2})});
 }

 TEST_CASE(load_and_run_async)
test/onnx/gen_onnx.py (+33 -0)

@@ -3341,6 +3341,39 @@ def instance_norm_type_mismatch_test():
     return ([node], [x, scale, bias], [y])


+@onnx_test()
+def instance_norm_dyn_batch_test():
+    # the batch size is a dynamic dimension
+    x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [None, 2, 3, 3])
+    scale = helper.make_tensor_value_info('1', TensorProto.FLOAT, [2])
+    bias = helper.make_tensor_value_info('2', TensorProto.FLOAT, [2])
+    y = helper.make_tensor_value_info('3', TensorProto.FLOAT, [None, 2, 3, 3])
+
+    node = onnx.helper.make_node('InstanceNormalization',
+                                 inputs=['0', '1', '2'],
+                                 outputs=['3'])
+
+    return ([node], [x, scale, bias], [y])
+
+
+@onnx_test()
+def instance_norm_dyn_batch_half_test():
+    # the batch size is a dynamic dimension
+    x = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [None, 2, 3, 3])
+    scale = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [2])
+    bias = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [2])
+    y = helper.make_tensor_value_info('3', TensorProto.FLOAT16, [None, 2, 3, 3])
+
+    node = onnx.helper.make_node('InstanceNormalization',
+                                 inputs=['0', '1', '2'],
+                                 outputs=['3'])
+
+    return ([node], [x, scale, bias], [y])
+
+
 @onnx_test()
 def instance_norm_invalid_type_test():
     x = helper.make_tensor_value_info('0', TensorProto.INT32, [1, 2, 3, 3])