Merge branch 'develop' into multinomial_parse

63952fb9 · Brian Pickrell · 61f3895c · e7471141 · 63952fb9 · 63952fb9
Commit 63952fb9 authored Jul 06, 2023 by Brian Pickrell
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,28 +27,27 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
    message(FATAL_ERROR "The binary and source directroy cannot be the same")
 endif()
+get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 # This has to be initialized before the project() command appears
 # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
-if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE )
+if(_GENERATOR_IS_MULTI_CONFIG)
-    set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
+    if (NOT CMAKE_CONFIGURATION_TYPES)
-endif()
+        set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING
+            "Available build types (configurations) on multi-config generators")
-# Setup valid strings for build type
+    endif()
-if (NOT CMAKE_CONFIGURATION_TYPES)
-    set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING "Configs")
-endif()
-set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
-# Default installation path
-if(WIN32)
-    set(CMAKE_INSTALL_PREFIX "/opt/rocm/x86_64-w64-mingw32" CACHE PATH "")
 else()
-    set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+    if(NOT CMAKE_BUILD_TYPE)
+        set(CMAKE_BUILD_TYPE Release CACHE STRING
+            "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
+    endif()
 endif()
 set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
-project(migraphx)
+project(migraphx LANGUAGES C CXX)
+include(CTest)
 find_package(ROCM REQUIRED)
 find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half)
@@ -128,6 +127,7 @@ rocm_enable_clang_tidy(
        -bugprone-implicit-widening-of-multiplication-result
        -bugprone-macro-parentheses
        -bugprone-signed-char-misuse
+        -bugprone-unchecked-optional-access
        # Disable the aliased reserved identifiers
        -cert-dcl37-c
        -cert-dcl51-cpp
@@ -269,7 +269,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 add_subdirectory(src)
 add_subdirectory(docs)
-add_subdirectory(test)
+if(BUILD_TESTING)
+    add_subdirectory(test)
+endif()
 add_subdirectory(tools)
 set(DEST_DIR ${CMAKE_BINARY_DIR})

--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,9 @@ RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl &&
 # Add rocm repository
 RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.5/ focal main > /etc/apt/sources.list.d/rocm.list'
+# Pin packages from repo.radeon.com at priority 600 (taken from the ROCm install instructions on docs.amd.com)
+RUN sh -c "echo 'Package: *\nPin: release o=repo.radeon.com\nPin-priority: 600' > /etc/apt/preferences.d/rocm-pin-600"
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
@@ -110,7 +113,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
-RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@a997d5f51314b45d7a4c04f1599966dcf53f9b4d -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@8d25af3b3721c159bb41cc6388e9453b1018c126 -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -89,6 +89,8 @@ def rocmnodename(name) {
        node_name = "${rocmtest_name} && vega";
    } else if(name == "navi21") {
        node_name = "${rocmtest_name} && navi21";
+    } else if(name == "mi100+") {
+        node_name = "${rocmtest_name} && (gfx908 || gfx90a)";
    } else if(name == "anygpu") {
        node_name = "${rocmtest_name} && (gfx908 || gfx90a || vega)";
    } else if(name == "nogpu") {
@@ -120,7 +122,7 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
    }
 }, hiprtc_gpu_debug: rocmnode('vega') { cmake_build ->
    stage('HipRTC GPU Debug') {
-        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On", gpu_debug: true, hiprtc_workarounds:  true)
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On", gpu_debug: true, hiprtc_workarounds: true)
    }
 }, all_targets_debug : rocmnode('vega') { cmake_build ->
    stage('All targets Release') {
@@ -134,6 +136,12 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
        }
    }
+}, ck_release: rocmnode('mi100+') { cmake_build ->
+    stage('CK Release') {
+        withEnv(['MIGRAPHX_ENABLE_CK=1', 'MIGRAPHX_TUNE_CK=1']) {
+            cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
+        }
+    }
 }, clang_asan: rocmnode('nogpu') { cmake_build ->
    stage('Clang ASAN') {
        def sanitizers = "undefined,address"

--- a/requirements.txt
+++ b/requirements.txt
@@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.4.2
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@ac580f77a84c705c678816ef7195adfcc02bdda5 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@5172ec5280f14974beee2acf1af1db3b2670244c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -157,7 +157,7 @@ insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref>
                ins, make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}), inputs);
        }
        std::transform(inputs.begin() + 1, inputs.end(), inputs.begin() + 1, [&](auto input) {
-            // uses previous multibroadcast to avoid recalculating the common shape from the
+            // uses previous input to avoid recalculating the common shape from the
            // full set of input shapes at runtime
            if(input->get_shape().dyn_dims() != c_dyn_dims)
            {

--- a/src/dead_code_elimination.cpp
+++ b/src/dead_code_elimination.cpp
@@ -49,8 +49,10 @@ void dead_code_elimination::apply(module& m) const
        if(i == last)
            break;
        // Skip instruction with empty shape as output unless its [dynamic, builtin, undefined,
-        // identity, allocate]
+        // identity, allocate or tuple_type]
-        if((not i->get_shape().dynamic() and i->get_shape().elements() == 0) and
+        if((not i->get_shape().dynamic() and
+            (i->get_shape().elements() == 0 and
+             i->get_shape().type() != migraphx::shape::tuple_type)) and
           not(i->name().front() == '@') and not contains({"identity", "allocate"}, i->name()) and
           not i->is_undefined())
            continue;

--- a/src/include/migraphx/matcher.hpp
+++ b/src/include/migraphx/matcher.hpp
@@ -31,6 +31,7 @@
 #include <migraphx/optional.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/type_name.hpp>
+#include <migraphx/source_location.hpp>
 #include <migraphx/config.hpp>
 #include <unordered_map>
 #include <unordered_set>
@@ -370,31 +371,30 @@ match::matcher_result find_match(module& modl, M&& m)
 }
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MATCHES)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MATCHES_FOR)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_VALIDATE_MATCHES)
 /// Find matches for an instruction in the module for per section of matchers
 template <class Mod, class... Ms>
-void find_matches(size_t trace_pass, Mod& mod, instruction_ref ins, Ms&&... ms)
+void find_matches_for(source_location location, Mod& mod, instruction_ref ins, Ms&&... ms)
 {
-#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 5
+    const int trace         = value_of(MIGRAPHX_TRACE_MATCHES{});
-    const
+    const bool validate     = enabled(MIGRAPHX_VALIDATE_MATCHES{});
-#endif
+    const auto trace_filter = string_value_of(MIGRAPHX_TRACE_MATCHES_FOR{});
-        int trace = value_of(MIGRAPHX_TRACE_MATCHES{});
+    const bool trace_for    = not trace_filter.empty() and
-#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 5
+                           (contains(std::string{location.file_name()}, trace_filter) or
-    const
+                            contains(std::string{location.function_name()}, trace_filter));
-#endif
+    bool match = false;
-        bool validate = enabled(MIGRAPHX_VALIDATE_MATCHES{});
-    bool match        = false;
    each_args(
        [&](auto&& m) {
            if(match)
                return;
-            if(trace > 1 or trace_pass > 1)
+            if(trace > 1 or trace_for)
                std::cout << "Match: " << get_type_name(m) << std::endl;
            auto r = match_instruction(get_module(mod), ins, m.matcher());
            if(r.result == get_module(mod).end())
                return;
-            if(trace > 0 or trace_pass > 0)
+            if(trace > 0 or trace_for)
            {
                std::cout << "Matched by " << get_type_name(m) << std::endl;
                get_module(mod).debug_print(ins);
@@ -420,23 +420,19 @@ void find_matches(size_t trace_pass, Mod& mod, instruction_ref ins, Ms&&... ms)
 /// Find matches in a module
 template <class Mod, class... Ms>
-void find_matches(Mod& mod, Ms&&... ms)
+struct find_matches
 {
-    for(auto ins : iterator_for(get_module(mod)))
+    find_matches(Mod& mod, Ms&&... ms, source_location location = source_location::current())
    {
-        find_matches(0, mod, ins, ms...);
+        for(auto ins : iterator_for(get_module(mod)))
+        {
+            find_matches_for(location, mod, ins, ms...);
+        }
    }
-}
+};
-/// Find matches in a pass
 template <class Mod, class... Ms>
-void find_matches(size_t trace_pass, Mod& mod, Ms&&... ms)
+find_matches(Mod& mod, Ms&&... ms) -> find_matches<Mod, Ms...>;
-{
-    for(auto ins : iterator_for(get_module(mod)))
-    {
-        find_matches(trace_pass, mod, ins, ms...);
-    }
-}
 template <class M, class F>
 struct find_generic_match

--- a/src/include/migraphx/op/clip.hpp
+++ b/src/include/migraphx/op/clip.hpp
@@ -25,12 +25,13 @@
 #define MIGRAPHX_GUARD_OPERATORS_CLIP_HPP
 #include <array>
+#include <cmath>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/config.hpp>
 #include <migraphx/value.hpp>
-#include <cmath>
+#include <migraphx/dyn_output.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -52,11 +53,11 @@ struct clip
        return inputs.front();
    }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
        visit_all(result, args[0], args[1], args[2])([&](auto output, auto x, auto min, auto max) {
-            par_for(output_shape.elements(),
+            par_for(dyn_out.computed_shape.elements(),
                    [&](auto i) { output[i] = std::min(std::max(min[i], x[i]), max[i]); });
        });

--- a/src/include/migraphx/op/convert.hpp
+++ b/src/include/migraphx/op/convert.hpp
@@ -66,17 +66,7 @@ struct convert : unary<convert>
        auto type = target_type;
        return [type](auto x) {
            auto y = x;
-            shape::visit(type, [&](auto as) {
+            shape::visit(type, [&](auto as) { y = as(x); });
-                // clamping value between target_type's max and min doesn't work for NaNs,
-                if(std::isnan(x))
-                {
-                    y = as.nan();
-                }
-                else
-                {
-                    y = std::min(std::max(as(x), as.min()), as.max());
-                }
-            });
            return y;
        };
    }

--- a/src/include/migraphx/replace_allocate.hpp
+++ b/src/include/migraphx/replace_allocate.hpp
@@ -30,7 +30,7 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-struct module;
+struct module_pass_manager;
 /**
 *  Replace `allocate` instructions with target allocations or output parameters.
@@ -40,7 +40,7 @@ struct replace_allocate
    allocation_model model;
    bool offload_copy = false;
    std::string name() const { return "replace_allocate"; }
-    void apply(module& m) const;
+    void apply(module_pass_manager& mpm) const;
 };
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/source_location.hpp
+++ b/src/include/migraphx/source_location.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_MIGRAPHX_SOURCE_LOCATION_HPP
+#define MIGRAPHX_GUARD_MIGRAPHX_SOURCE_LOCATION_HPP
+// <cstdint> provides std::uint_least32_t, which the fallback source_location below uses.
+#include <cstdint>
+#include <migraphx/config.hpp>
+// Feature detection. Sets two macros to 0 or 1:
+//   MIGRAPHX_HAS_SOURCE_LOCATION    - <source_location> can be used
+//   MIGRAPHX_HAS_SOURCE_LOCATION_TS - <experimental/source_location> can be used
+// Both are forced to 1 when CPPCHECK is defined, and both are 0 when the
+// preprocessor has no __has_include.
+#if defined(CPPCHECK)
+#define MIGRAPHX_HAS_SOURCE_LOCATION 1
+#define MIGRAPHX_HAS_SOURCE_LOCATION_TS 1
+#elif defined(__has_include)
+// NOTE(review): C++20 compilers report __cplusplus == 202002L, so this threshold only
+// selects <source_location> for standards newer than C++20 - confirm that is intended.
+#if __has_include(<source_location>) && __cplusplus >= 202003L
+#define MIGRAPHX_HAS_SOURCE_LOCATION 1
+#else
+#define MIGRAPHX_HAS_SOURCE_LOCATION 0
+#endif
+#if __has_include(<experimental/source_location>) && __cplusplus >= 201103L
+#define MIGRAPHX_HAS_SOURCE_LOCATION_TS 1
+#else
+#define MIGRAPHX_HAS_SOURCE_LOCATION_TS 0
+#endif
+#else
+#define MIGRAPHX_HAS_SOURCE_LOCATION 0
+#define MIGRAPHX_HAS_SOURCE_LOCATION_TS 0
+#endif
+// Include whichever implementation was detected; the standard header wins over the TS one.
+#if MIGRAPHX_HAS_SOURCE_LOCATION
+#include <source_location>
+#elif MIGRAPHX_HAS_SOURCE_LOCATION_TS
+#include <experimental/source_location>
+#endif
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+// migraphx::source_location is an alias for the detected implementation, or a stub
+// with the same member functions when none is available.
+#if MIGRAPHX_HAS_SOURCE_LOCATION
+using source_location = std::source_location;
+#elif MIGRAPHX_HAS_SOURCE_LOCATION_TS
+using source_location = std::experimental::source_location;
+#else
+/// Fallback used when no source_location header is available: it carries no
+/// information, so line() and column() return 0 and the name accessors return "".
+struct source_location
+{
+    static constexpr source_location current() noexcept { return source_location{}; }
+    constexpr std::uint_least32_t line() const noexcept { return 0; }
+    constexpr std::uint_least32_t column() const noexcept { return 0; }
+    constexpr const char* file_name() const noexcept { return ""; }
+    constexpr const char* function_name() const noexcept { return ""; }
+};
+#endif
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_MIGRAPHX_SOURCE_LOCATION_HPP
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -326,6 +326,8 @@ instruction_ref module::replace_instruction(instruction_ref ins, instruction_ref
    if(ins == std::prev(this->end()))
    {
+        // The "rep" instruction could be used earlier in the program, and moving it to the end
+        // may produce an invalid program; therefore insert an identity operation in this case.
        return replace_instruction(ins, make_op("identity"), rep);
    }

--- a/src/onnx/onnx_parser.cpp
+++ b/src/onnx/onnx_parser.cpp
@@ -38,6 +38,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_ONNX_PARSER)
 static shape shape_from_dyn_dims(shape::type_t shape_type,
                                 const std::vector<shape::dynamic_dimension>& dyn_dims)
@@ -53,8 +56,6 @@ static shape shape_from_dyn_dims(shape::type_t shape_type,
    return {shape_type, dyn_dims};
 }
-namespace onnx {
 static onnx_parser::attribute_map get_attributes(const onnx::NodeProto& node)
 {
    std::unordered_map<std::string, onnx::AttributeProto> result;
@@ -297,16 +298,48 @@ int64_t onnx_parser::get_opset_version(const onnx::ModelProto& model)
    return version;
 }
-std::vector<instruction_ref>
+/// Debug helper that prints the instructions reachable from `result` that are not in `args`.
+/// Starting from `result`, it walks each instruction's inputs() recursively, skipping
+/// anything found in `args` or already collected, and records an instruction only after
+/// its inputs have been visited. The collected list is then passed to mod->debug_print().
+void print_added_instructions(module* mod,
-onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph, bool inlining)
+                              const std::vector<instruction_ref>& args,
+                              const std::vector<instruction_ref>& result)
+{
+    // Print instructions added by the parser not in args
+    std::vector<instruction_ref> added_instructions;
+    // `fix` presumably supplies `self` so the lambda can call itself - confirm against its definition.
+    fix([&](auto self, auto r) {
+        for(auto ins : r)
+        {
+            if(contains(args, ins))
+                continue;
+            if(contains(added_instructions, ins))
+                continue;
+            // Visit the inputs first so they are appended before the instruction that uses them.
+            self(ins->inputs());
+            added_instructions.push_back(ins);
+        }
+    })(result);
+    mod->debug_print(added_instructions);
+}
+/// Adds every initializer of `graph` to `mod` as a literal (via parser.parse_tensor) and
+/// returns a map from initializer name to the instruction that was created for it.
+/// When MIGRAPHX_TRACE_ONNX_PARSER is enabled, the initializer name and the resulting
+/// instruction are printed as each initializer is processed.
+std::unordered_map<std::string, instruction_ref>
+parse_intializer(const onnx_parser& parser, module* mod, const onnx::GraphProto& graph)
 {
    std::unordered_map<std::string, instruction_ref> mod_insts;
    for(auto&& f : graph.initializer())
    {
+        if(enabled(MIGRAPHX_TRACE_ONNX_PARSER{}))
+            std::cout << "initializer: " << f.name() << std::endl;
        // backup instructions in parent mod
-        mod_insts[f.name()] = mod->add_literal(parse_tensor(f));
+        mod_insts[f.name()] = mod->add_literal(parser.parse_tensor(f));
+        if(enabled(MIGRAPHX_TRACE_ONNX_PARSER{}))
+            mod->debug_print(mod_insts[f.name()]);
    }
+    return mod_insts;
+}
+std::unordered_map<std::string, instruction_ref>
+parse_inputs(const onnx_parser& parser,
+             module* mod,
+             const onnx::GraphProto& graph,
+             std::unordered_map<std::string, instruction_ref> mod_insts)
+{
    for(auto&& input : graph.input())
    {
        const std::string& name = input.name();
@@ -317,7 +350,7 @@ onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph, bool inlini
            // scenario that a nested subgraph contains a parameter with the
            // name existed in its parent graph.
            // In the current implementation, MIGraphX throws an exception for that.
-            if(contains(instructions, name))
+            if(contains(parser.instructions, name))
            {
                MIGRAPHX_THROW("module \"" + mod->name() + "\" has parameter name \"" + name +
                               "\" existing in parent graph!");
@@ -325,28 +358,41 @@ onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph, bool inlini
            shape s;
            std::vector<std::size_t> dims;
-            if(map_input_dims.count(name) > 0)
+            if(parser.map_input_dims.count(name) > 0)
            {
-                dims = map_input_dims.at(name);
+                dims = parser.map_input_dims.at(name);
-                s    = parse_type(input.type(), dims);
+                s    = parser.parse_type(input.type(), dims);
            }
-            else if(map_dyn_input_dims.count(name) > 0)
+            else if(parser.map_dyn_input_dims.count(name) > 0)
            {
                shape::type_t shape_type = get_type(input.type().tensor_type().elem_type());
-                s = shape_from_dyn_dims(shape_type, map_dyn_input_dims.at(name));
+                s = shape_from_dyn_dims(shape_type, parser.map_dyn_input_dims.at(name));
            }
            else
            {
-                s = parse_type(input.type(), dims);
+                s = parser.parse_type(input.type(), dims);
            }
            mod_insts[name] = mod->add_parameter(name, s);
        }
    }
+    return mod_insts;
+}
+std::vector<instruction_ref>
+onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph, bool inlining)
+{
+    std::unordered_map<std::string, instruction_ref> mod_insts =
+        parse_intializer(*this, mod, graph);
+    mod_insts = parse_inputs(*this, mod, graph, mod_insts);
    std::copy(mod_insts.begin(), mod_insts.end(), std::inserter(instructions, instructions.end()));
    for(auto&& node : graph.node())
    {
+        if(enabled(MIGRAPHX_TRACE_ONNX_PARSER{}))
+            std::cout << "operator: " << node.op_type() << std::endl;
        std::vector<instruction_ref> args;
        for(auto&& input : node.input())
        {
@@ -384,6 +430,11 @@ onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph, bool inlini
                       result.begin(),
                       std::inserter(instructions, instructions.end()),
                       [](auto&& x, auto&& y) { return std::make_pair(x, y); });
+        if(enabled(MIGRAPHX_TRACE_ONNX_PARSER{}))
+        {
+            print_added_instructions(mod, args, result);
+        }
    }
    // Find instructions corresponding to the output

--- a/src/onnx/parse_where.cpp
+++ b/src/onnx/parse_where.cpp
@@ -31,8 +31,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace onnx {
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
 struct parse_where : op_parser<parse_where>
 {
    std::vector<op_desc> operators() const { return {{"Where"}}; }
@@ -59,13 +57,6 @@ struct parse_where : op_parser<parse_where>
                compute_broadcasted_lens(args[0]->get_shape().lens(), args[1]->get_shape().lens());
            lens = compute_broadcasted_lens(lens, args[2]->get_shape().lens());
-            if(enabled(MIGRAPHX_ENABLE_CK{}))
-            {
-                // Convert condition tensor to int32 to work around CK not supporting bool type
-                args[0] = info.add_instruction(
-                    make_op("convert", {{"target_type", shape::int32_type}}), args[0]);
-            }
            if(args[0]->get_shape().lens() != lens)
            {
                args[0] =

--- a/src/pass_manager.cpp
+++ b/src/pass_manager.cpp
@@ -68,12 +68,18 @@ void run_pass(program& prog, const pass& p, tracer trace)
 struct module_pm : module_pass_manager
 {
    module* mod           = nullptr;
+    module* root_mod      = nullptr;
    tracer* t             = nullptr;
    module* common_parent = nullptr;
    program* prog         = nullptr;
    module_pm(module* pmod = nullptr, tracer* pt = nullptr) : mod(pmod), t(pt) {}
+    // Overload that also records the root module, which get_root_module() returns when set.
+    // `pmod` and `rmod` carry no defaults: the constructor above already accepts zero or one
+    // argument, and defaulting every parameter here would make those calls ambiguous.
+    module_pm(module* pmod, module* rmod, tracer* pt = nullptr)
+        : mod(pmod), root_mod(rmod), t(pt)
+    {
+    }
    template <class... Ts>
    void trace(Ts&&... xs) const
    {
@@ -97,6 +103,8 @@ struct module_pm : module_pass_manager
    virtual module* get_root_module() override
    {
+        if(root_mod != nullptr)
+            return root_mod;
        assert(prog);
        return prog->get_main_module();
    }
@@ -140,7 +148,7 @@ void run_passes(program& prog, module_ref root_mod, const std::vector<pass>& pas
                continue;
            if(not visited.insert(mod).second)
                continue;
-            module_pm mpm{mod, &trace};
+            module_pm mpm{mod, root_mod, &trace};
            mpm.prog      = &prog;
            auto parents  = range(tree.equal_range(mod));
            auto nparents = distance(parents);
@@ -164,7 +172,7 @@ void run_passes(module& mod, const std::vector<pass>& passes, tracer trace)
        trace = tracer{std::cout};
    for(const auto& p : passes)
    {
-        module_pm{&mod, &trace}.run_pass(p);
+        module_pm{&mod, &mod, &trace}.run_pass(p);
    }
 }

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -359,6 +359,31 @@ std::string classify(T x)
    }
 }
+/// Writes summary statistics for the elements of `a` to `os` on one line: minimum,
+/// maximum, mean, and standard deviation (computed by dividing by the element count).
+/// The second visitor handles an argument made of nested arguments by printing the
+/// statistics of each nested argument in turn.
+void print_statistics(std::ostream& os, const argument& a)
+{
+    a.visit(
+        [&](auto t) {
+            // With no elements there is nothing for min_element/max_element to point at
+            // and the mean would divide by zero, so report that case explicitly.
+            if(t.size() == 0)
+            {
+                os << "No elements\n";
+                return;
+            }
+            os << "Min value: " << *std::min_element(t.begin(), t.end()) << ", ";
+            os << "Max value: " << *std::max_element(t.begin(), t.end()) << ", ";
+            double num_elements = t.size();
+            // Accumulate in double (0.0 seed) regardless of the element type.
+            auto mean           = std::accumulate(t.begin(), t.end(), 0.0) / num_elements;
+            auto stddev         = std::sqrt(
+                std::accumulate(t.begin(),
+                                t.end(),
+                                0.0,
+                                [&](auto r, auto v) { return r + std::pow((v - mean), 2.0); }) /
+                num_elements);
+            os << "Mean: " << mean << ", ";
+            os << "StdDev: " << stddev << "\n";
+        },
+        [&](const auto& xs) {
+            for(const auto& x : xs)
+            {
+                print_statistics(os, x);
+            }
+        });
+}
 std::unordered_set<std::string> classify_argument(const argument& a)
 {
    std::unordered_set<std::string> result;
@@ -578,6 +603,7 @@ std::vector<argument> program::eval(parameter_map params, execution_environment
                        std::cout << "Output: ";
                        preview_argument(std::cout, buffer);
                        std::cout << std::endl;
+                        print_statistics(std::cout, buffer);
                    }
                    else
                    {

--- a/src/promote_literals.cpp
+++ b/src/promote_literals.cpp
@@ -34,7 +34,7 @@ void promote_literals::apply(module_pass_manager& mpm) const
 {
    module& m              = mpm.get_module();
    module_ref root_module = mpm.get_root_module();
-    if(m.name() == "main")
+    if(m == *root_module)
        return;
    for(auto ins : iterator_for(m))

--- a/src/quantize_fp16.cpp
+++ b/src/quantize_fp16.cpp
@@ -52,14 +52,6 @@ static void quantize_module(module& m, const std::vector<std::string>& ins_names
        auto mod_inputs = ins->module_inputs();
        auto s          = ins->get_shape();
-        // Convert back to original type before quantizing the inputs
-        if(mod_inputs.empty())
-        {
-            auto r = m.insert_instruction(
-                std::next(ins), make_op("convert", {{"target_type", s.type()}}), ins);
-            m.replace_instruction(ins, r);
-        }
        // Convert each of the inputs that are floating point to fp16
        auto inputs = ins->inputs();
        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
@@ -70,8 +62,17 @@ static void quantize_module(module& m, const std::vector<std::string>& ins_names
                ins, make_op("convert", {{"target_type", shape::half_type}}), input);
        });
-        // Replace inputs
+        // Insert quantized ins
-        m.replace_instruction(ins, ins->get_operator(), inputs, mod_inputs);
+        auto converted_ins = m.insert_instruction(ins, ins->get_operator(), inputs, mod_inputs);
+        // Convert back to original type after quantizing
+        if(mod_inputs.empty())
+        {
+            converted_ins = m.insert_instruction(
+                ins, make_op("convert", {{"target_type", s.type()}}), converted_ins);
+        }
+        // Replace original instruction
+        m.replace_instruction(ins, converted_ins);
    }
 }

--- a/src/replace_allocate.cpp
+++ b/src/replace_allocate.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include <migraphx/pass_manager.hpp>
 #include <migraphx/replace_allocate.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/program.hpp>
@@ -84,10 +85,11 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
    mod.replace_instruction(ins, ins->get_operator(), inputs, mod_args);
 }
-void replace_allocate::apply(module& m) const
+void replace_allocate::apply(module_pass_manager& mpm) const
 {
+    module& m              = mpm.get_module();
    auto mod_output_names  = create_output_names(m);
-    bool main_offload_copy = m.name() == "main" ? this->offload_copy : false;
+    bool root_offload_copy = (*mpm.get_root_module() == m) ? this->offload_copy : false;
    for(auto ins : iterator_for(m))
    {
        auto op      = ins->get_operator();
@@ -104,7 +106,7 @@ void replace_allocate::apply(module& m) const
            continue;
        auto s = ins->get_shape();
-        if(not main_offload_copy and model.needs_out_params() and contains(mod_output_names, ins))
+        if(not root_offload_copy and model.needs_out_params() and contains(mod_output_names, ins))
        {
            auto out_param = m.add_parameter(mod_output_names[ins], s);
            m.replace_instruction(ins, out_param);

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -39,8 +39,6 @@
 #include <migraphx/algorithm.hpp>
 #include <unordered_set>
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_SIMPLIFY_ALGEBRA_MATCHES)
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -1487,13 +1485,10 @@ struct find_split_transpose
 void simplify_algebra::apply(module& m) const
 {
-    size_t trace = value_of(MIGRAPHX_TRACE_SIMPLIFY_ALGEBRA_MATCHES{});
    // Run simplifications multiple times
    for(int i = 0; i < 8; i++)
    {
-        match::find_matches(trace,
+        match::find_matches(m,
-                            m,
                            find_inner_broadcast{},
                            find_dot_broadcast{},
                            find_double_add_lit_broadcast{},