manual merge

b9d37172 · Khalique Ahmed · 1af66a1c · ea62d7aa · b9d37172 · b9d37172
Commit b9d37172 authored Oct 10, 2023 by Khalique Ahmed
20 changed files
--- a/examples/vision/cpp_mnist/mnist_inference.cpp
+++ b/examples/vision/cpp_mnist/mnist_inference.cpp
@@ -160,9 +160,9 @@ int main(int argc, char** argv)
    auto lengths = shape.lengths();
    auto num_results =
        std::accumulate(lengths.begin(), lengths.end(), 1, std::multiplies<size_t>());
-    float* results = reinterpret_cast<float*>(outputs[0].data());
-    float* max     = std::max_element(results, results + num_results);
-    int answer     = max - results;
+    float* results   = reinterpret_cast<float*>(outputs[0].data());
+    const float* max = std::max_element(results, results + num_results);
+    int answer       = max - results;

    std::cout << std::endl
              << "Randomly chosen digit: " << rand_digit << std::endl
@@ -192,12 +192,12 @@ void read_nth_digit(const int n, std::vector<float>& digit)
        for(int i = 0; i < HEIGHT * WIDTH; ++i)
        {
            unsigned char temp = 0;
-            file.read((char*)&temp, sizeof(temp));
+            file.read(reinterpret_cast<char*>(&temp), sizeof(temp));
            if(d == n)
            {
                float data = temp / 255.0;
                digit.push_back(data);
-                std::cout << SYMBOLS[(int)(data * 10) % 11];
+                std::cout << SYMBOLS[static_cast<int>(data * 10) % 11];
                if((i + 1) % WIDTH == 0)
                    std::cout << std::endl;
            }

--- a/hip-clang.docker
+++ b/hip-clang.docker
@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
 RUN dpkg --add-architecture i386

 # Add rocm repository
-RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.6/ focal main > /etc/apt/sources.list.d/rocm.list'
+RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.7/ focal main > /etc/apt/sources.list.d/rocm.list'

 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -54,5 +54,9 @@ ADD dev-requirements.txt /dev-requirements.txt
 ADD requirements.txt /requirements.txt
 ADD rbuild.ini /rbuild.ini

+# Temporarily install a new cmake until switching to ubuntu 22.04
+RUN pip3 install cmake==3.22.1
+
 COPY ./tools/install_prereqs.sh /
 RUN /install_prereqs.sh /usr/local / && rm /install_prereqs.sh
+
--- a/rbuild.ini
+++ b/rbuild.ini
@@ -6,7 +6,9 @@ deps =
    -f requirements.txt

 [gh]
-ignore = danmar/cppcheck
+ignore =
+    danmar/cppcheck
+    ROCmSoftwarePlatform/rocMLIR
 deps =
    -f dev-requirements.txt
    oneapi-src/oneDNN@v1.7
@@ -27,3 +29,12 @@ define =
    CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
    MIGRAPHX_ENABLE_CPU=On
    BUILD_DEV=On
+
+[cibuild]
+cxx = ${rocm_path}/llvm/bin/clang++
+cc = ${rocm_path}/llvm/bin/clang
+deps =
+    -f dev-requirements.txt
+define =
+    CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
+    CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,4 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@5172ec5280f14974beee2acf1af1db3b2670244c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@a22e479b8e1557961039db2d5c5ff89cff35e86b -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/rocMLIR@a48dfb1f163fb0b38369e73e580968b72e85b594 -DBUILD_FAT_LIBROCKCOMPILER=On
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
 #####################################################################################
 # The MIT License (MIT)
 #
-# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,7 @@ include(ROCMInstallTargets)
 include(ROCMPackageConfigHelpers)
 include(RegisterOp)
 include(CheckCXXLinkerFlag)
+ 

 add_library(migraphx 
    adjust_allocation.cpp
@@ -36,6 +37,7 @@ add_library(migraphx
    argument.cpp
    auto_contiguous.cpp
    common.cpp
+    common_dims.cpp
    compile_src.cpp
    convert_to_json.cpp
    cpp_generator.cpp
@@ -94,6 +96,7 @@ add_library(migraphx
    serialize.cpp
    shape.cpp
    simplify_algebra.cpp
+    simplify_dyn_ops.cpp
    simplify_reshapes.cpp
    split_single_dyn_dim.cpp
    target.cpp
@@ -140,6 +143,7 @@ register_migraphx_ops(
    equal
    erf
    exp
+    fill
    flatten
    floor
    fmod
@@ -183,6 +187,8 @@ register_migraphx_ops(
    quant_convolution
    quant_dot
    quantizelinear
+    random_uniform
+    random_seed
    recip
    reduce_max
    reduce_mean
@@ -191,6 +197,7 @@ register_migraphx_ops(
    reduce_sum
    relu
    reshape
+    reshape_lazy
    reverse
    rnn
    rnn_last_cell_output
@@ -248,8 +255,6 @@ endif()
 target_link_libraries(migraphx PRIVATE -ldl)

 target_include_directories(migraphx SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-
-find_package(Threads)
 target_link_libraries(migraphx PUBLIC Threads::Threads)

 find_package(nlohmann_json 3.8.0 REQUIRED)

--- a/src/api/api.cpp
+++ b/src/api/api.cpp
@@ -899,7 +899,7 @@ migraphx_dynamic_dimensions_assign_to(migraphx_dynamic_dimensions_t output,

 extern "C" migraphx_status
 migraphx_dynamic_dimensions_create(migraphx_dynamic_dimensions_t* dynamic_dimensions,
-                                   const_migraphx_dynamic_dimension_t* ptr,
+                                   const const_migraphx_dynamic_dimension_t* ptr,
                                   size_t size)
 {
    auto api_error_result = migraphx::try_([&] {
@@ -1432,7 +1432,7 @@ extern "C" migraphx_status migraphx_instructions_assign_to(migraphx_instructions
 }

 extern "C" migraphx_status migraphx_instructions_create(migraphx_instructions_t* instructions,
-                                                        const_migraphx_instruction_t* ptr,
+                                                        const const_migraphx_instruction_t* ptr,
                                                        size_t size)
 {
    auto api_error_result = migraphx::try_([&] {

--- a/src/api/include/migraphx/migraphx.h
+++ b/src/api/include/migraphx/migraphx.h
@@ -209,7 +209,7 @@ MIGRAPHX_C_EXPORT migraphx_status migraphx_dynamic_dimensions_assign_to(

 MIGRAPHX_C_EXPORT migraphx_status
 migraphx_dynamic_dimensions_create(migraphx_dynamic_dimensions_t* dynamic_dimensions,
-                                   const_migraphx_dynamic_dimension_t* ptr,
+                                   const const_migraphx_dynamic_dimension_t* ptr,
                                   size_t size);

 MIGRAPHX_C_EXPORT migraphx_status
@@ -377,7 +377,7 @@ MIGRAPHX_C_EXPORT migraphx_status migraphx_instructions_assign_to(
    migraphx_instructions_t output, const_migraphx_instructions_t input);

 MIGRAPHX_C_EXPORT migraphx_status migraphx_instructions_create(
-    migraphx_instructions_t* instructions, const_migraphx_instruction_t* ptr, size_t size);
+    migraphx_instructions_t* instructions, const const_migraphx_instruction_t* ptr, size_t size);

 MIGRAPHX_C_EXPORT migraphx_status migraphx_modules_destroy(migraphx_modules_t modules);


--- a/src/api/migraphx.py
+++ b/src/api/migraphx.py
@@ -79,7 +79,8 @@ def dynamic_dimension(h):
 def dynamic_dimensions(h):
    h.constructor(
        'create',
-        api.params(ptr='const_migraphx_dynamic_dimension_t*', size='size_t'),
+        api.params(ptr='const const_migraphx_dynamic_dimension_t*',
+                   size='size_t'),
        fname='migraphx::to_obj_vector<const_migraphx_dynamic_dimension_t>')
    h.method('size', returns='size_t')
    h.method('get',
@@ -215,7 +216,7 @@ def instruction(h):
 def instructions(h):
    h.constructor(
        'create',
-        api.params(ptr='const_migraphx_instruction_t*', size='size_t'),
+        api.params(ptr='const const_migraphx_instruction_t*', size='size_t'),
        fname='migraphx::to_obj_vector<const_migraphx_instruction_t>')



--- a/src/auto_contiguous.cpp
+++ b/src/auto_contiguous.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/ranges.hpp>
-
 #include <migraphx/iterator_for.hpp>

 namespace migraphx {

--- a/src/common_dims.cpp
+++ b/src/common_dims.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/common_dims.hpp>
+#include <migraphx/ranges.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
// Walks [start, last) while accumulating the running product of the visited
// elements and stops at the first element whose inclusion pushes the product
// above `dim`.
//
// Returns an iterator to that element (or `last` when the product never
// exceeds `dim`), so that [start, result) is the prefix that was accepted.
// If the accumulated product ends up smaller than `dim`, `start` is returned
// instead, which yields an empty prefix.
template <class Iterator>
static auto compute_end_dim(Iterator start, Iterator last, std::size_t dim)
{
    std::size_t product = 1;
    auto cursor         = start;
    for(; cursor != last; ++cursor)
    {
        // The element is folded in before the check, so `product` already
        // includes the element `cursor` points at when the loop breaks.
        product *= *cursor;
        if(product > dim)
            break;
    }
    return (product < dim) ? start : cursor;
}
+
// Returns the product of every value in `r` as a std::size_t.
// An empty range yields 1 (the multiplicative identity).
template <class Range>
static auto elements(const Range& r)
{
    std::size_t total = 1;
    for(auto pos = r.begin(); pos != r.end(); ++pos)
        total = total * (*pos);
    return total;
}
+
+struct common_dim_state
+{
+    common_dim_state(const std::vector<std::size_t>& pdims,
+                     std::vector<std::vector<std::size_t>>& paxes_map)
+        : dims(&pdims), axes_map(&paxes_map), it(dims->begin())
+    {
+    }
+    const std::vector<std::size_t>* dims            = nullptr;
+    std::vector<std::vector<std::size_t>>* axes_map = nullptr;
+    std::vector<std::size_t>::const_iterator it{};
+    std::size_t rem = 1;
+    std::size_t get() const { return *it / rem; }
+    bool is_end() const { return it == dims->end(); }
+    void next(std::size_t i = 1) { it += i; }
+    auto dims_for(std::size_t d) const
+    {
+        auto dim_end = compute_end_dim(it, dims->end(), d);
+        return range(it, dim_end);
+    }
+    void add_axes(std::size_t naxes, std::size_t start) MIGRAPHX_TIDY_CONST
+    {
+        auto axes = compute_axes(naxes, start);
+        axes_map->push_back(std::move(axes));
+    }
+
+    void add_multi_axes(std::size_t naxes, std::size_t start) MIGRAPHX_TIDY_CONST
+    {
+        auto axes = compute_axes(naxes, start);
+        std::transform(axes.begin(),
+                       axes.end(),
+                       std::back_inserter(*axes_map),
+                       [&](auto axis) -> std::vector<std::size_t> { return {axis}; });
+    }
+    std::vector<std::size_t> compute_axes(std::size_t naxes, std::size_t start) const
+    {
+        if(rem != 1)
+        {
+            assert(start > 0);
+            naxes++;
+            start--;
+        }
+        std::vector<std::size_t> axes(naxes);
+        std::iota(axes.begin(), axes.end(), start);
+        return axes;
+    }
+};
+
+static bool compute_common_dim(std::vector<std::size_t>& cd_dims,
+                               common_dim_state& state1,
+                               common_dim_state& state2)
+{
+    assert(state1.get() <= state2.get());
+    auto d2    = state2.get();
+    auto dims  = state1.dims_for(d2);
+    auto n     = elements(dims);
+    auto naxes = distance(dims);
+    if(naxes == 0)
+        return false;
+    // If not divisible then we can't compute a common dim
+    if((d2 % n) != 0)
+        return false;
+    auto rem = d2 / n;
+    state1.add_multi_axes(naxes, cd_dims.size());
+    state2.add_axes(rem == 1 ? naxes : naxes + 1, cd_dims.size());
+
+    state1.rem = rem;
+    state2.rem = 1;
+
+    cd_dims.insert(cd_dims.end(), dims.begin(), dims.end());
+    if(state1.rem != 1)
+        cd_dims.push_back(state1.rem);
+    state1.next(distance(dims));
+    state2.next();
+    return true;
+}
+
+common_dims common_dims::compute(const std::vector<std::size_t>& dims1,
+                                 const std::vector<std::size_t>& dims2)
+{
+    assert(elements(dims1) > 0);
+    assert(elements(dims1) == elements(dims2));
+    common_dims cd;
+    common_dim_state state1{dims1, cd.axes_map1};
+    common_dim_state state2{dims2, cd.axes_map2};
+    while(not state1.is_end() and not state2.is_end())
+    {
+        auto d1 = state1.get();
+        auto d2 = state2.get();
+        if(d1 <= d2)
+        {
+            if(not compute_common_dim(cd.dims, state1, state2))
+                return {};
+        }
+        else // if(d1 > d2)
+        {
+            if(not compute_common_dim(cd.dims, state2, state1))
+                return {};
+        }
+    }
+    assert(elements(dims1) == elements(cd.dims));
+    return cd;
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/cpp_generator.cpp
+++ b/src/cpp_generator.cpp
@@ -213,13 +213,13 @@ cpp_generator::function cpp_generator::generate_module(const module& m,
                ins->get_literal().visit([&](auto v) {
                    assert(v.size() == 1);
                    auto x = v.front();
-                    if(std::isinf(x))
+                    if(std::isinf(static_cast<double>(x)))
                    {
                        string_literal = "__builtin_huge_val()";
                        if(x < 0)
                            string_literal = "-__builtin_huge_val()";
                    }
-                    else if(std::isnan(x))
+                    else if(std::isnan(static_cast<double>(x)))
                        string_literal = "__builtin_nan()";
                    else
                        string_literal = ins->get_literal().to_string();

--- a/src/driver/CMakeLists.txt
+++ b/src/driver/CMakeLists.txt
@@ -45,6 +45,9 @@ if(NOT WIN32)
 endif()
 rocm_clang_tidy_check(driver)

+file(STRINGS "${CMAKE_SOURCE_DIR}/test/onnx/.onnxrt-commit" String_output)
+target_compile_definitions(driver PUBLIC MIGRAPHX_ORT_SHA1="${String_output}")
+
 target_link_libraries(driver migraphx_all_targets migraphx_onnx migraphx_tf migraphx_py)

 rocm_install_targets(

--- a/src/driver/argument_parser.hpp
+++ b/src/driver/argument_parser.hpp
@@ -338,11 +338,22 @@ struct argument_parser

    MIGRAPHX_DRIVER_STATIC auto file_exist()
    {
-        return validate([](auto&, auto&, auto& params) {
+        return validate([](auto&, auto&, const auto& params) {
            if(params.empty())
                throw std::runtime_error("No argument passed.");
            if(not fs::exists(params.back()))
-                throw std::runtime_error("Path does not exists: " + params.back());
+                throw std::runtime_error("Path does not exist: " + params.back());
+        });
+    }
+
+    MIGRAPHX_DRIVER_STATIC auto matches(const std::unordered_set<std::string>& names)
+    {
+        return validate([=](auto&, auto&, const auto& params) {
+            auto invalid_param = std::find_if(
+                params.begin(), params.end(), [&](const auto& p) { return names.count(p) == 0; });
+            if(invalid_param != params.end())
+                throw std::runtime_error("Invalid argument: " + *invalid_param +
+                                         ". Valid arguments are {" + to_string_range(names) + "}");
        });
    }

@@ -570,8 +581,7 @@ struct argument_parser
                        continue;
                    if(flag[0] != '-')
                        continue;
-                    auto d =
-                        levenshtein_distance(flag.begin(), flag.end(), input.begin(), input.end());
+                    std::ptrdiff_t d = levenshtein_distance(flag, input);
                    if(d < result.distance)
                        result = result_t{&arg, flag, input, d};
                }

--- a/src/driver/main.cpp
+++ b/src/driver/main.cpp
@@ -82,6 +82,7 @@ struct loader
           {"--model"},
           ap.help("Load model"),
           ap.type("resnet50|inceptionv3|alexnet"),
+           ap.matches({"resnet50", "inceptionv3", "alexnet"}),
           ap.group("input"));
        ap(file_type, {"--onnx"}, ap.help("Load as onnx"), ap.set_value("onnx"));
        ap(file_type, {"--tf"}, ap.help("Load as tensorflow"), ap.set_value("tf"));
@@ -474,13 +475,15 @@ struct compiler
            {
                if(is_offload_copy_set(p) and not co.offload_copy)
                {
-                    std::cout << "MIGraphX program was likely compiled with offload_copy set, Try "
-                                 "passing "
-                                 "`--enable-offload-copy` if program run fails.\n";
+                    std::cout
+                        << "[WARNING]: MIGraphX program was likely compiled with offload_copy "
+                           "set, Try "
+                           "passing "
+                           "`--enable-offload-copy` if program run fails.\n";
                }
                else if(co.offload_copy)
                {
-                    std::cout << "MIGraphX program was likely compiled without "
+                    std::cout << "[WARNING]: MIGraphX program was likely compiled without "
                                 "offload_copy set, Try "
                                 "removing "
                                 "`--enable-offload-copy` flag if passed to driver, if program run "
@@ -533,13 +536,19 @@ struct params : command<params>
 struct verify : command<verify>
 {
    compiler c;
-    double tolerance     = 80;
+    migraphx::verify::tolerance tols;
    bool per_instruction = false;
    bool reduce          = false;
    void parse(argument_parser& ap)
    {
        c.parse(ap);
-        ap(tolerance, {"--tolerance"}, ap.help("Tolerance for errors"));
+        ap(tols.rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error (Default: 0.001)"));
+        ap(tols.atol,
+           {"--atol"},
+           ap.help("Tolerance for the elementwise absolute difference (Default: 0.001)"));
+        ap(tols.rtol,
+           {"--rtol"},
+           ap.help("Tolerance for the elementwise relative difference (Default: 0.001)"));
        ap(per_instruction,
           {"-i", "--per-instruction"},
           ap.help("Verify each instruction"),
@@ -564,15 +573,15 @@ struct verify : command<verify>

        if(per_instruction)
        {
-            verify_instructions(p, t, c.co, quantize, tolerance);
+            verify_instructions(p, t, c.co, quantize, tols);
        }
        else if(reduce)
        {
-            verify_reduced_program(p, t, c.co, quantize, m, tolerance);
+            verify_reduced_program(p, t, c.co, quantize, m, tols);
        }
        else
        {
-            verify_program(c.l.file, p, t, c.co, quantize, m, tolerance);
+            verify_program(c.l.file, p, t, c.co, quantize, m, tols);
        }
    }
 };
@@ -769,7 +778,7 @@ struct main_command
        {
            std::cout << "'" << color::fg_yellow << wrong_commands.front() << color::reset
                      << "' is not a valid command." << std::endl;
-            std::cout << get_command_help("Available commands:") << std::endl;
+            std::cout << get_command_help("Available commands:");
        }
        else
        {
@@ -801,6 +810,13 @@ int main(int argc, const char* argv[])

    auto&& m = get_commands();
    auto cmd = args.front();
+
+    if(cmd == "ort-sha")
+    {
+        std::cout << MIGRAPHX_ORT_SHA1 << std::endl;
+        return 0;
+    }
+
    if(m.count(cmd) > 0)
    {
        m.at(cmd)(argv[0], {args.begin() + 1, args.end()});

--- a/src/driver/verify.cpp
+++ b/src/driver/verify.cpp
@@ -30,6 +30,7 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/compile_options.hpp>
 #include <migraphx/quantization.hpp>
+#include <migraphx/ranges.hpp>

 namespace migraphx {
 namespace driver {
@@ -76,15 +77,25 @@ void verify_program(const std::string& name,
                    compile_options options,
                    precision quantize,
                    const parameter_map& inputs,
-                    double tolerance)
+                    verify::tolerance tols)
 {
-    auto x = run_ref(p, inputs);
-    auto y = run_target(p, t, options, quantize, inputs);
+    auto ref_outs    = run_ref(p, inputs);
+    auto target_outs = run_target(p, t, options, quantize, inputs);

-    std::size_t output_num = x.size();
+    std::size_t output_num = ref_outs.size();
    for(std::size_t i = 0; i < output_num; ++i)
    {
-        verify_args(name, x[i], y[i], tolerance);
+        if(ref_outs[i].get_shape().type() != target_outs[i].get_shape().type() or
+           ref_outs[i].get_shape().lens() != target_outs[i].get_shape().lens())
+        {
+            std::cout << "FAILED: " << name << std::endl;
+            std::cout << "Shape mismatch {" << ref_outs[i].get_shape() << "} != {"
+                      << target_outs[i].get_shape() << "}" << std::endl;
+        }
+        else
+        {
+            verify_args(name, target_outs[i], verify::expected{ref_outs[i]}, tols);
+        }
    }
 }

@@ -92,7 +103,7 @@ void verify_instructions(const program& prog,
                         const target& t,
                         compile_options options,
                         precision quantize,
-                         double tolerance)
+                         verify::tolerance tols)
 {
    const auto* mm_prog = prog.get_main_module();
    for(auto&& ins : (*mm_prog))
@@ -123,8 +134,7 @@ void verify_instructions(const program& prog,
        {
            std::cout << "Verify: " << ins.name() << std::endl;
            std::cout << p << std::endl;
-            verify_program(
-                ins.name(), p, t, options, quantize, create_param_map(p, false), tolerance);
+            verify_program(ins.name(), p, t, options, quantize, create_param_map(p, false), tols);
        }
        catch(...)
        {
@@ -140,14 +150,22 @@ void verify_reduced(program p,
                    compile_options options,
                    precision quantize,
                    const parameter_map& inputs,
-                    double tolerance)
+                    verify::tolerance tols)
 {
    auto* mm  = p.get_main_module();
-    auto last = std::prev(mm->end(), n + 1);
+    auto last = std::prev(mm->end(), n);
    mm->remove_instructions(last, mm->end());
    std::cout << "Verify: " << n << std::endl;
    std::cout << p << std::endl;
-    verify_program(std::to_string(n), p, t, options, quantize, inputs, tolerance);
+    try
+    {
+        verify_program(std::to_string(n), p, t, options, quantize, inputs, tols);
+    }
+    catch(const std::exception& e)
+    {
+        std::cout << "FAILED: " << n << std::endl;
+        std::cout << "Exception: " << e.what() << std::endl;
+    }
 }

 void verify_reduced_program(const program& p,
@@ -155,14 +173,20 @@ void verify_reduced_program(const program& p,
                            compile_options options,
                            precision quantize,
                            const parameter_map& inputs,
-                            double tolerance)
+                            verify::tolerance tols)
 {
    const auto* mm = p.get_main_module();
    auto n         = std::distance(mm->begin(), mm->end());
    std::cout << "Verify steps: " << n << std::endl;
-    for(std::size_t i = 0; i < n; i++)
+    for(std::size_t i = 1; i < n; i++)
    {
-        verify_reduced(p, i, t, options, quantize, inputs, tolerance);
+        auto last = std::prev(mm->end(), i + 1);
+        if(contains({"@literal", "@param"}, last->name()))
+        {
+            std::cout << "Skip: " << i << std::endl;
+            continue;
+        }
+        verify_reduced(p, i, t, options, quantize, inputs, tols);
    }
 }


--- a/src/driver/verify.hpp
+++ b/src/driver/verify.hpp
@@ -26,6 +26,7 @@

 #include "precision.hpp"
 #include <migraphx/program.hpp>
+#include <migraphx/verify.hpp>

 namespace migraphx {
 namespace driver {
@@ -37,18 +38,18 @@ void verify_program(const std::string& name,
                    compile_options options     = compile_options{},
                    precision quantize          = precision::fp32,
                    const parameter_map& inputs = {},
-                    double tolerance            = 100);
+                    verify::tolerance tols      = verify::tolerance{});
 void verify_instructions(const program& prog,
                         const target& t,
                         compile_options options = compile_options{},
                         precision quantize      = precision::fp32,
-                         double tolerance        = 80);
+                         verify::tolerance tols  = verify::tolerance{});
 void verify_reduced_program(const program& p,
                            const target& t,
                            compile_options options     = compile_options{},
                            precision quantize          = precision::fp32,
                            const parameter_map& inputs = {},
-                            double tolerance            = 80);
+                            verify::tolerance tols      = verify::tolerance{});

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver

--- a/src/eliminate_contiguous.cpp
+++ b/src/eliminate_contiguous.cpp
@@ -36,6 +36,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS)
+
 static bool try_compute_shape(instruction_ref ins,
                              const std::vector<shape>& inputs,
                              const std::vector<module_ref>& mods)
@@ -79,14 +81,26 @@ static bool try_compute_shape(instruction_ref ins,
                return (arg == ins) ? new_shape : arg->get_shape();
            });

-            if(not try_compute_shape(output, input_shapes, mods))
+            if(not try_compute_shape(output, input_shapes, output->module_inputs()))
            {
                return false;
            }
        }
    }
+    catch(const std::exception& e)
+    {
+        if(enabled(MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS{}))
+        {
+            std::cout << "Exception: " << e.what() << std::endl;
+        }
+        return false;
+    }
    catch(...)
    {
+        if(enabled(MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS{}))
+        {
+            std::cout << "Unknown exception" << std::endl;
+        }
        return false;
    }

@@ -128,6 +142,11 @@ static void remove_contiguous(const std::string& op_name, module& m, F f)
        {
            if(arg->name() != op_name)
                continue;
+            if(enabled(MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS{}))
+            {
+                std::cout << "eliminate_contiguous: ";
+                m.debug_print(ins);
+            }
            auto prev = arg->inputs().front();
            replace(new_args, arg, prev);
            if(try_compute_shape(ins, new_args, mod_args))

--- a/src/fuse_pointwise.cpp
+++ b/src/fuse_pointwise.cpp
@@ -24,11 +24,14 @@
 #include <migraphx/fuse_pointwise.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/ranges.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/common_dims.hpp>
 #include <iterator>

 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
@@ -41,7 +44,7 @@ static literal get_scalar(instruction_ref ins)
    if(ins->name() == "contiguous")
        return get_scalar(ins->inputs().front());
    const auto& s = ins->get_shape();
-    if(s.elements() != 1 && not(s.scalar()))
+    if(s.elements() != 1 and not(s.scalar()))
        return {};
    if(not ins->can_eval())
        return {};
@@ -189,6 +192,54 @@ static bool find_pointwise_modules(module& m)
    }
    return changed;
 }
+namespace {
+struct find_pointwise_reshape_pointwise
+{
+    auto matcher() const
+    {
+        auto reshape =
+            match::name("reshape", "squeeze", "unsqueeze", "flatten")(match::used_once());
+        auto skip_contiguous = [](auto... ms) {
+            return match::arg(0)(match::skip(match::name("contiguous")(match::used_once()))(ms...));
+        };
+        auto pointwise         = match::name("pointwise")(match::used_once());
+        auto reshape_pointwise = reshape(skip_contiguous(pointwise.bind("x"))).bind("reshape");
+        return match::name("pointwise")(match::any_of[match::inputs()](reshape_pointwise));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins         = r.result;
+        auto x_ins       = r.instructions["x"];
+        auto reshape_ins = r.instructions["reshape"];
+
+        auto cd = common_dims::compute(ins->get_shape().lens(), x_ins->get_shape().lens());
+        if(cd.dims.empty())
+            return;
+
+        auto reshape_input = [&](const auto& ins_to_insert) {
+            return [&](auto input) {
+                auto c = m.insert_instruction(ins_to_insert, make_op("contiguous"), input);
+                return m.insert_instruction(
+                    ins_to_insert, make_op("reshape", {{"dims", cd.dims}}), c);
+            };
+        };
+        auto x_inputs = x_ins->inputs();
+        std::transform(x_inputs.begin(), x_inputs.end(), x_inputs.begin(), reshape_input(x_ins));
+        auto new_x_ins =
+            m.insert_instruction(x_ins, x_ins->get_operator(), x_inputs, x_ins->module_inputs());
+
+        auto inputs = ins->inputs();
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
+            if(input == reshape_ins)
+                return new_x_ins;
+            return reshape_input(ins)(input);
+        });
+        auto pw = m.insert_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
+        m.replace_instruction(ins, make_op("reshape", {{"dims", ins->get_shape().lens()}}), pw);
+    }
+};
+} // namespace

 void fuse_pointwise::apply(module_pass_manager& mpm) const
 {
@@ -200,6 +251,8 @@ void fuse_pointwise::apply(module_pass_manager& mpm) const
    }
    for(int i = 0; i < 8; i++)
    {
+        match::find_matches(mpm.get_module(), find_pointwise_reshape_pointwise{});
+        mpm.run_pass(simplify_reshapes{1});
        if(not find_pointwise_modules(mpm.get_module()))
            break;
        mpm.run_pass(dead_code_elimination{});

--- a/src/fuse_reduce.cpp
+++ b/src/fuse_reduce.cpp
@@ -52,7 +52,7 @@ struct fused_reduce
    {
        if(mods.size() != 1)
            MIGRAPHX_THROW("should have one submodule.");
-        auto* sm = mods.front();
+        const auto* sm = mods.front();
        if(sm->get_output_shapes().size() != 1)
            MIGRAPHX_THROW("Only one output supported");
        auto names = sm->get_parameter_names();
@@ -143,7 +143,7 @@ insert_module_in_submodule(module_ref sm,
 }

 static std::vector<instruction_ref>
-find_inputs(module_ref sm,
+find_inputs(const_module_ref sm,
            const module& parent,
            const std::unordered_map<instruction_ref, instruction_ref>& map_ins)
 {

--- a/src/include/migraphx/algorithm.hpp
+++ b/src/include/migraphx/algorithm.hpp
@@ -26,6 +26,8 @@

 #include <algorithm>
 #include <numeric>
+#include <string>
+#include <vector>
 #include <migraphx/config.hpp>

 namespace migraphx {
@@ -90,6 +92,42 @@ levenshtein_distance(Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterat
    return std::ptrdiff_t{1} + std::min({x1, x2, x3});
 }

// Computes the Levenshtein (edit) distance between `s1` and `s2`: the minimum
// number of single-character insertions, deletions and substitutions needed
// to turn one string into the other.
//
// Uses a single rolling row of the dynamic-programming table, so the scratch
// storage is min(|s1|, |s2|) + 1 entries.
inline size_t levenshtein_distance(const std::string& s1, const std::string& s2)
{
    const size_t l1 = s1.length();
    const size_t l2 = s2.length();

    // The distance is symmetric; delegate so the shorter string indexes the
    // row. The result of the swapped call must be returned, otherwise the
    // work is simply done twice.
    if(l1 < l2)
        return levenshtein_distance(s2, s1);

    // d[j] holds the distance between the first i characters of s1 and the
    // first j characters of s2 for the row currently being processed.
    std::vector<size_t> d(l2 + 1);
    std::iota(d.begin(), d.end(), size_t{0});

    for(size_t i = 1; i <= l1; i++)
    {
        // `diagonal` is the previous row's value at column j - 1.
        size_t diagonal = d[0];
        d[0]            = i;

        for(size_t j = 1; j <= l2; j++)
        {
            // Previous row's value at column j; it becomes the diagonal for
            // the next column regardless of whether the characters match.
            const size_t above = d[j];
            if(s1[i - 1] == s2[j - 1])
            {
                d[j] = diagonal;
            }
            else
            {
                // substitute (diagonal), insert (left) or delete (above)
                d[j] = std::min({diagonal, d[j - 1], above}) + 1;
            }
            diagonal = above;
        }
    }

    return d[l2];
}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx