Commit cd4ab535 authored by Khalique Ahmed

manual merge

parents 3891ee58 a0fa3742
......@@ -571,10 +571,90 @@ using require_interface =
// NOLINTNEXTLINE
#define MIGRAPHX_CONST_HANDLE_BASE(name) MIGRAPHX_DETAIL_HANDLE_BASE(name, const)
/**
* Container to hold optimal dynamic dimension values.
*/
struct optimals : MIGRAPHX_HANDLE_BASE(optimals)
{
MIGRAPHX_HANDLE_CONSTRUCTOR(optimals)
optimals(std::initializer_list<size_t> init_list)
{
this->make_handle(&migraphx_optimals_create, init_list.begin(), init_list.size());
}
};
/**
* @brief Dynamic dimension object.
* @details minimum, maximum, and optimal dimensions
*/
struct dynamic_dimension : MIGRAPHX_CONST_HANDLE_BASE(dynamic_dimension)
{
MIGRAPHX_HANDLE_CONSTRUCTOR(dynamic_dimension)
dynamic_dimension(size_t min, size_t max)
{
this->make_handle(&migraphx_dynamic_dimension_create_min_max, min, max);
}
dynamic_dimension(size_t min, size_t max, const optimals& opts)
{
this->make_handle(
&migraphx_dynamic_dimension_create_min_max_optimals, min, max, opts.get_handle_ptr());
}
bool is_fixed() const
{
bool result = false;
call(&migraphx_dynamic_dimension_is_fixed, &result, this->get_handle_ptr());
return result;
}
friend bool operator==(const dynamic_dimension& x, const dynamic_dimension& y)
{
bool pout;
call(&migraphx_dynamic_dimension_equal, &pout, x.get_handle_ptr(), y.get_handle_ptr());
return pout;
}
friend bool operator!=(const dynamic_dimension& x, const dynamic_dimension& y)
{
return not(x == y);
}
};
/**
* Container to hold dynamic_dimension objects.
*/
struct dynamic_dimensions : MIGRAPHX_HANDLE_BASE(dynamic_dimensions)
{
MIGRAPHX_HANDLE_CONSTRUCTOR(dynamic_dimensions)
template <class... Ts>
dynamic_dimensions(Ts... xs)
{
std::array<const_migraphx_dynamic_dimension_t, sizeof...(Ts)> a{xs.get_handle_ptr()...};
this->make_handle(&migraphx_dynamic_dimensions_create, a.data(), a.size());
}
size_t size() const
{
size_t pout;
call(&migraphx_dynamic_dimensions_size, &pout, this->get_handle_ptr());
return pout;
}
dynamic_dimension operator[](size_t pidx) const
{
const_migraphx_dynamic_dimension_t pout;
call(&migraphx_dynamic_dimensions_get, &pout, this->get_handle_ptr(), pidx);
return {pout, this->share_handle()};
}
};
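A minimal usage sketch of these wrappers (not part of the diff; assumes the public C++ API header name):
#include <migraphx/migraphx.hpp> // assumed public C++ API header
void dynamic_dims_example()
{
    migraphx::optimals opts{1, 2, 4, 8};            // candidate optimal values
    migraphx::dynamic_dimension batch{1, 64, opts}; // min=1, max=64, with optimals
    migraphx::dynamic_dimension channels{3, 3};     // fixed dimension: min == max
    migraphx::dynamic_dimensions dims{batch, channels, channels};
}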
/**
* @brief Describe shape of tensor
* @details A shape consists of a data type, lengths of multi-dimension tensor, and strides
*
*/
struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
{
......@@ -598,6 +678,13 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
this->make_handle(&migraphx_shape_create, type, plengths.data(), plengths.size());
}
// Force all calls of the format `shape( type_t, { size_t compatibles } )` to map to
// shape(type_t, std::vector<std::size_t> l)
shape(migraphx_shape_datatype_t t, std::initializer_list<std::size_t> d)
: shape::shape(t, std::vector<std::size_t>{d.begin(), d.end()})
{
}
shape(migraphx_shape_datatype_t type,
std::vector<size_t> plengths,
std::vector<size_t> pstrides)
......@@ -610,6 +697,11 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
pstrides.size());
}
shape(migraphx_shape_datatype_t type, const dynamic_dimensions& dyn_dims)
{
this->make_handle(&migraphx_shape_create_dynamic, type, dyn_dims.get_handle_ptr());
}
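// Sketch (not part of this diff): building a dynamic shape from the wrappers above:
//   dynamic_dimensions dims{dynamic_dimension{1, 64}, dynamic_dimension{3, 3}};
//   shape s{migraphx_shape_float_type, dims};
//   // s.dynamic() is true and s.dyn_dims() returns the dimensions back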
std::vector<size_t> lengths() const
{
const size_t* pout;
......@@ -626,6 +718,14 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
return {pout, pout + pout_size};
}
/// Get the dynamic dimensions of the shape
dynamic_dimensions dyn_dims() const
{
migraphx_dynamic_dimensions_t pout;
call(&migraphx_shape_dyn_dims, &pout, this->get_handle_ptr());
return {pout, own{}};
}
migraphx_shape_datatype_t type() const
{
migraphx_shape_datatype_t pout;
......@@ -654,6 +754,14 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
return result;
}
/// Is the shape dynamic
bool dynamic() const
{
bool result = false;
call(&migraphx_shape_dynamic, &result, this->get_handle_ptr());
return result;
}
// map element index to space index
size_t index(size_t i) const
{
......@@ -687,6 +795,11 @@ struct argument : MIGRAPHX_CONST_HANDLE_BASE(argument)
MIGRAPHX_DEPRECATED("Contructor without lifetime annotation is deprecated.")
argument(const migraphx_argument* p) { this->set_handle(p, borrow{}); }
argument(shape pshape)
{
this->make_handle(&migraphx_argument_create_empty, pshape.get_handle_ptr());
}
argument(shape pshape, void* pbuffer)
{
this->make_handle(&migraphx_argument_create, pshape.get_handle_ptr(), pbuffer);
......@@ -1182,12 +1295,27 @@ struct onnx_options : MIGRAPHX_HANDLE_BASE(onnx_options)
dim.size());
}
void set_dyn_input_parameter_shape(const std::string& name, const dynamic_dimensions& dyn_dims)
{
call(&migraphx_onnx_options_set_dyn_input_parameter_shape,
this->get_handle_ptr(),
name.c_str(),
dyn_dims.get_handle_ptr());
}
/// When there is a dimension parameter, then use this default value
void set_default_dim_value(unsigned int value)
{
call(&migraphx_onnx_options_set_default_dim_value, this->get_handle_ptr(), value);
}
void set_default_dyn_dim_value(const dynamic_dimension& dd)
{
call(&migraphx_onnx_options_set_default_dyn_dim_value,
this->get_handle_ptr(),
dd.get_handle_ptr());
}
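// Sketch of intended use (the parameter name "input" is hypothetical):
//   onnx_options opts;
//   dynamic_dimension batch{1, 64};
//   opts.set_dyn_input_parameter_shape("input",
//       dynamic_dimensions{batch,
//                          dynamic_dimension{3, 3},
//                          dynamic_dimension{224, 224},
//                          dynamic_dimension{224, 224}});
//   opts.set_default_dyn_dim_value(batch);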
/// Set default max iteration number for the loop operator
void set_default_loop_iterations(int64_t value)
{
......
......@@ -45,56 +45,48 @@ def shape_type_wrap(p):
p.read = 'migraphx::to_shape_type(${name})'
@api.cwrap('migraphx::compile_options')
def compile_options_type_wrap(p):
if p.returns:
p.add_param('migraphx_compile_options *')
p.bad_param('${name} == nullptr', 'Null pointer')
p.write = ['*${name} = migraphx::to_compile_options(${result})']
else:
p.add_param('migraphx_compile_options *')
p.read = '${name} == nullptr ? migraphx::compile_options{} : migraphx::to_compile_options(*${name})'
@api.cwrap('migraphx::file_options')
def file_options_type_wrap(p):
if p.returns:
p.add_param('migraphx_file_options *')
p.bad_param('${name} == nullptr', 'Null pointer')
p.write = ['*${name} = migraphx::to_file_options(${result})']
else:
p.add_param('migraphx_file_options *')
p.read = '${name} == nullptr ? migraphx::file_options{} : migraphx::to_file_options(*${name})'
def auto_handle(*args, **kwargs):
def with_handle(f):
return api.handle('migraphx_' + f.__name__, 'migraphx::' + f.__name__,
*args, **kwargs)(f)
return with_handle
@api.cwrap('migraphx::onnx_options')
def onnx_options_type_wrap(p):
if p.returns:
p.add_param('migraphx_onnx_options *')
p.bad_param('${name} == nullptr', 'Null pointer')
p.write = ['*${name} = migraphx::to_onnx_options(${result})']
else:
p.add_param('migraphx_onnx_options *')
p.read = '${name} == nullptr ? migraphx::onnx_options{} : migraphx::to_onnx_options(*${name})'
@api.handle('migraphx_optimals', 'std::set<size_t>')
def optimals(h):
h.constructor('create',
api.params(ptr='const size_t*', size='size_t'),
fname='migraphx::make_set<size_t>')
@api.cwrap('migraphx::tf_options')
def tf_options_type_wrap(p):
if p.returns:
p.add_param('migraphx_tf_options *')
p.bad_param('${name} == nullptr', 'Null pointer')
p.write = ['*${name} = migraphx::to_tf_options(${result})']
else:
p.add_param('migraphx_tf_options *')
p.read = '${name} == nullptr ? migraphx::tf_options{} : migraphx::to_tf_options(*${name})'
@api.handle('migraphx_dynamic_dimension', 'migraphx::shape::dynamic_dimension')
def dynamic_dimension(h):
h.constructor('create_min_max', api.params(min='size_t', max='size_t'))
h.constructor(
'create_min_max_optimals',
api.params(min='size_t', max='size_t', optimals='std::set<size_t>'))
h.method('is_fixed', returns='bool', const=True)
h.method('equal',
api.params(x='const migraphx::shape::dynamic_dimension&'),
invoke='migraphx::equal($@)',
returns='bool',
const=True)
def auto_handle(*args, **kwargs):
def with_handle(f):
return api.handle('migraphx_' + f.__name__, 'migraphx::' + f.__name__,
*args, **kwargs)(f)
return with_handle
@api.handle('migraphx_dynamic_dimensions',
'std::vector<migraphx::shape::dynamic_dimension>')
def dynamic_dimensions(h):
h.constructor(
'create',
api.params(ptr='const_migraphx_dynamic_dimension_t*', size='size_t'),
fname='migraphx::to_obj_vector<const_migraphx_dynamic_dimension_t>')
h.method('size', returns='size_t')
h.method('get',
api.params(idx='size_t'),
fname='at',
cpp_name='operator[]',
returns='const migraphx::shape::dynamic_dimension&')
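For reference, this registration emits C entry points along these lines (a sketch inferred from how the C++ wrapper invokes them, not the verbatim generated header):
migraphx_status migraphx_dynamic_dimensions_create(migraphx_dynamic_dimensions_t* out,
                                                   const_migraphx_dynamic_dimension_t* ptr,
                                                   size_t size);
migraphx_status migraphx_dynamic_dimensions_size(size_t* out,
                                                 migraphx_dynamic_dimensions_t dims);
migraphx_status migraphx_dynamic_dimensions_get(const_migraphx_dynamic_dimension_t* out,
                                                migraphx_dynamic_dimensions_t dims,
                                                size_t idx);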
@auto_handle()
......@@ -109,20 +101,29 @@ def shape(h):
lengths='std::vector<size_t>',
strides='std::vector<size_t>'))
h.constructor('create_scalar', api.params(type='migraphx::shape::type_t'))
h.constructor(
'create_dynamic',
api.params(type='migraphx::shape::type_t',
dims='std::vector<migraphx::shape::dynamic_dimension>'))
h.method('lengths',
fname='lens',
returns='const std::vector<size_t>&',
const=True)
h.method('strides', returns='const std::vector<size_t>&', const=True)
h.method('dyn_dims',
returns='std::vector<migraphx::shape::dynamic_dimension>',
const=True)
h.method('type', returns='migraphx::shape::type_t', const=True)
h.method('elements', returns='size_t', const=True)
h.method('bytes', returns='size_t', const=True)
h.method('ndim', returns='size_t', const=True)
h.method('equal',
api.params(x='const migraphx::shape&'),
invoke='migraphx::equal($@)',
returns='bool',
const=True)
h.method('standard', returns='bool', const=True)
h.method('dynamic', returns='bool', const=True)
h.method('index', api.params(i='size_t'), returns='size_t', const=True)
......@@ -130,6 +131,7 @@ def shape(h):
def argument(h):
h.constructor('create',
api.params(shape='const migraphx::shape&', buffer='void*'))
h.constructor('create_empty', api.params(shape='const migraphx::shape&'))
h.method('shape',
fname='get_shape',
cpp_name='get_shape',
......@@ -325,11 +327,22 @@ def onnx_options(h):
api.params(name='const char*', dims='std::vector<size_t>'),
invoke='migraphx::set_input_parameter_shape($@)',
)
h.method(
'set_dyn_input_parameter_shape',
api.params(name='const char*',
dims='std::vector<migraphx::shape::dynamic_dimension>'),
invoke='migraphx::set_dyn_input_parameter_shape($@)',
)
h.method(
'set_default_dim_value',
api.params(value='size_t'),
invoke='migraphx::set_default_dim_value($@)',
)
h.method(
'set_default_dyn_dim_value',
api.params(dd='const migraphx::shape::dynamic_dimension&'),
invoke='migraphx::set_default_dyn_dim_value($@)',
)
h.method(
'set_default_loop_iterations',
api.params(value='int64_t'),
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -31,20 +31,6 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
// Example:
// s0 = (3,2,4,5) and s1 = (2,1,1)
//
// In this case we need to broadcast (:,1,1) portion of
// s1 plus broadcast the 1st dimension of s1
// giving output_lens = (3,2,4,5)
//
// Another example:
// s0 = (3,2,1,5) and s1 = (2,7,5)
// In this case we need to broadcast the (:,:,1:,:) axis
// of s0 plus the 1st dimension of s1 giving
// output_lens = (3,2,7,5)
//
std::vector<std::size_t> compute_broadcasted_lens(std::vector<std::size_t> s0,
std::vector<std::size_t> s1)
{
......@@ -77,32 +63,38 @@ std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, sha
}
auto offset = s1.ndim() - s0.ndim();
std::vector<shape::dynamic_dimension> out_dims(s1.dyn_dims());
std::transform(
s0.dyn_dims().cbegin(),
s0.dyn_dims().cend(),
s1.dyn_dims().cbegin() + offset,
out_dims.begin() + offset,
[&](auto a, auto b) {
if(a == b)
{
return a;
}
else if(a == 1 or b == 1)
{
// setting opt to 0, may need to be changed
return shape::dynamic_dimension{std::max(a.min, b.min), std::max(a.max, b.max), 0};
}
else
{
MIGRAPHX_THROW("COMPUTE_BROADCASTED_DYN_DIMS: dynamic shapes {" +
migraphx::to_string_range(s0.dyn_dims()) + "} and {" +
migraphx::to_string_range(s1.dyn_dims()) + "} mismatch!");
}
});
std::transform(s0.dyn_dims().cbegin(),
s0.dyn_dims().cend(),
s1.dyn_dims().cbegin() + offset,
out_dims.begin() + offset,
[&](auto a, auto b) {
if(a == b or b == 1)
{
return a;
}
else if(a == 1)
{
return b;
}
else
{
MIGRAPHX_THROW("COMPUTE_BROADCASTED_DYN_DIMS: dynamic shapes {" +
migraphx::to_string_range(s0.dyn_dims()) + "} and {" +
migraphx::to_string_range(s1.dyn_dims()) + "} mismatch!");
}
});
return out_dims;
}
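// Worked example (sketch) of the rule above:
//   s0.dyn_dims() = { {1,64}, {3,3} }        // ndim 2
//   s1.dyn_dims() = { {2,2}, {1,1}, {3,3} }  // ndim 3, so offset = 1
// {1,64} pairs with {1,1} (b == 1, keep a) and {3,3} pairs with {3,3} (equal),
// giving out_dims = { {2,2}, {1,64}, {3,3} }.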
// Compute the common (broadcasted) dimensions of a list of fixed shapes
std::vector<shape::dynamic_dimension> compute_common_dyn_dims(const std::vector<shape>& shapes)
{
auto ret_shape = shapes.at(0);
std::for_each(shapes.cbegin() + 1, shapes.cend(), [&](auto s) {
ret_shape = shape{ret_shape.type(), compute_broadcasted_dyn_dims(ret_shape, s)};
});
return ret_shape.dyn_dims();
}
std::vector<std::size_t> compute_common_lens(const std::vector<shape>& shapes)
{
assert(not shapes.empty());
......@@ -148,42 +140,35 @@ shape common_shape(const std::vector<shape>& shapes)
return {compute_common_types(shapes), compute_common_lens(shapes)};
}
instruction_ref insert_common_op(module& m,
instruction_ref ins,
const operation& op,
std::vector<instruction_ref> inputs)
std::vector<instruction_ref>
insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref> inputs)
{
if(std::any_of(
inputs.cbegin(), inputs.cend(), [](auto input) { return input->get_shape().dynamic(); }))
{
// currently only handles the binary case
if(inputs.size() != 2)
{
MIGRAPHX_THROW("INSERT_COMMON_OP: not handled; " + migraphx::to_string(inputs.size()) +
"inputs, only handle two inputs if any are dynamic shape");
}
auto c_type = compute_common_types(to_shapes(inputs));
auto c_dyn_dims =
compute_broadcasted_dyn_dims(inputs[0]->get_shape(), inputs[1]->get_shape());
auto input_shapes = to_shapes(inputs);
auto c_type = compute_common_types(input_shapes);
auto c_dyn_dims = compute_common_dyn_dims(input_shapes);
// following should work for a static or dynamic shape
if(inputs[0]->get_shape().dyn_dims() != c_dyn_dims)
{
inputs[0] = m.insert_instruction(
ins,
make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}),
inputs[0],
inputs[1]);
}
if(inputs[1]->get_shape().dyn_dims() != c_dyn_dims)
{
inputs[1] = m.insert_instruction(
ins,
make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}),
inputs[1],
inputs[0]);
ins, make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}), inputs);
}
std::transform(inputs.begin() + 1, inputs.end(), inputs.begin() + 1, [&](auto input) {
// uses previous multibroadcast to avoid recalculating the common shape from the
// full set of input shapes at runtime
if(input->get_shape().dyn_dims() != c_dyn_dims)
{
return m.insert_instruction(
ins,
make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}),
input,
inputs[0]);
}
return input;
});
std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
if(input->get_shape().type() != c_type)
{
......@@ -210,7 +195,20 @@ instruction_ref insert_common_op(module& m,
return input;
});
}
return m.insert_instruction(ins, op, inputs);
return inputs;
}
std::vector<instruction_ref> add_common_args(module& m, std::vector<instruction_ref> inputs)
{
return insert_common_args(m, m.end(), std::move(inputs));
}
instruction_ref insert_common_op(module& m,
instruction_ref ins,
const operation& op,
std::vector<instruction_ref> inputs)
{
return m.insert_instruction(ins, op, insert_common_args(m, ins, std::move(inputs)));
}
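// Usage sketch (x and y are hypothetical instruction_refs already in m):
//   auto sum = add_common_op(m, make_op("add"), {x, y});
// Both inputs are multibroadcast (and type-converted) to the common shape
// before the "add" is appended at the end of the module.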
instruction_ref add_common_op(module& m, const operation& op, std::vector<instruction_ref> inputs)
......
......@@ -106,6 +106,18 @@ cpp_generator::function& cpp_generator::function::set_generic_types(const module
return *this;
}
cpp_generator::function& cpp_generator::function::unused_param(const std::string& pname)
{
body.insert(0, "(void)" + pname + ";\n");
return *this;
}
cpp_generator::function& cpp_generator::function::add_generic_param(const std::string& pname)
{
params.push_back({pname, "T" + pname});
tparams.push_back("class T" + pname);
return *this;
}
struct cpp_generator_impl
{
std::stringstream fs{};
......@@ -167,6 +179,8 @@ std::string cpp_generator::generate_point_op(const operation& op,
else if(with_char(::isdigit)(key[0]))
{
auto i = std::stoul(key);
if(i >= args.size())
MIGRAPHX_THROW("Invalid argument index: " + key);
return args.at(i);
}
else if(v.contains(key))
......@@ -182,7 +196,8 @@ std::string cpp_generator::generate_point_op(const operation& op,
std::string cpp_generator::str() const { return impl->fs.str(); }
cpp_generator::function cpp_generator::generate_module(const module& m)
cpp_generator::function cpp_generator::generate_module(const module& m,
const generate_module_callback& g)
{
function f;
auto name = transform_string(m.name(), [](char c) {
......@@ -193,15 +208,25 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
f.set_name(name).set_types(m).set_body(
m, [&](instruction_ref ins, const auto& names) -> std::string {
if(ins->name() == "@literal")
return shape::cpp_type(ins->get_shape().type()) + "(" +
ins->get_literal().to_string() + ")";
std::vector<std::string> args;
std::transform(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(args),
[&](auto i) { return names.at(i); });
auto s = this->generate_point_op(ins->get_operator(), args);
{
std::string string_literal;
ins->get_literal().visit([&](auto v) {
assert(v.size() == 1);
auto x = v.front();
if(std::isinf(x))
{
string_literal = "__builtin_huge_val()";
if(x < 0)
string_literal = "-__builtin_huge_val()";
}
else if(std::isnan(x))
string_literal = "__builtin_nan()";
else
string_literal = ins->get_literal().to_string();
});
return shape::cpp_type(ins->get_shape().type()) + "(" + string_literal + ")";
}
auto s = g(ins, names);
if(impl->fresult)
return impl->fresult(ins->get_shape()) + '(' + s + ')';
else
......@@ -210,6 +235,24 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
return f;
}
std::vector<std::string>
cpp_generator::to_args(const std::vector<instruction_ref>& inputs,
const std::unordered_map<instruction_ref, std::string>& names)
{
std::vector<std::string> args;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(args), [&](auto i) {
return names.at(i);
});
return args;
}
cpp_generator::function cpp_generator::generate_module(const module& m)
{
return this->generate_module(m, [&](auto ins, const auto& names) {
return this->generate_point_op(ins->get_operator(), to_args(ins->inputs(), names));
});
}
std::string cpp_generator::create_function(const cpp_generator::function& f)
{
impl->function_count++;
......@@ -218,6 +261,8 @@ std::string cpp_generator::create_function(const cpp_generator::function& f)
std::string name = f.name.empty() ? "f" + std::to_string(impl->function_count) : f.name;
impl->fs << join_strings(f.attributes, " ") << " " << f.return_type << " " << name;
char delim = '(';
if(f.params.empty())
impl->fs << delim;
for(auto&& p : f.params)
{
impl->fs << delim << p.type << " " << p.name;
......
......@@ -148,13 +148,21 @@ struct value_parser
template <MIGRAPHX_REQUIRES(not std::is_enum<T>{} and not is_multi_value<T>{})>
static T apply(const std::string& x)
{
T result;
std::stringstream ss;
ss.str(x);
ss >> result;
if(ss.fail())
throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
return result;
// handle whitespace in string
if constexpr(std::is_same<T, std::string>{})
{
return x;
}
else
{
T result;
std::stringstream ss;
ss.str(x);
ss >> result;
if(ss.fail())
throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
return result;
}
}
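// Example (sketch): the string branch above preserves embedded whitespace,
//   value_parser<std::string>::apply("hello world") -> "hello world"
// whereas stream extraction would have stopped at the first space.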
template <MIGRAPHX_REQUIRES(std::is_enum<T>{} and not is_multi_value<T>{})>
......
......@@ -33,6 +33,7 @@
#include <migraphx/tf.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/load_save.hpp>
#include <migraphx/json.hpp>
#include <migraphx/version.h>
......@@ -68,7 +69,9 @@ struct loader
bool brief = false;
std::string output_type;
std::string output;
std::string default_dyn_dim;
std::vector<std::string> param_dims;
std::vector<std::string> dyn_param_dims;
std::vector<std::string> output_names;
void parse(argument_parser& ap)
......@@ -83,7 +86,11 @@ struct loader
ap(file_type, {"--tf"}, ap.help("Load as tensorflow"), ap.set_value("tf"));
ap(file_type, {"--migraphx"}, ap.help("Load as MIGraphX"), ap.set_value("migraphx"));
ap(file_type, {"--migraphx-json"}, ap.help("Load as MIGraphX JSON"), ap.set_value("json"));
ap(batch, {"--batch"}, ap.help("Set batch size for model"));
ap(batch,
{"--batch"},
ap.help("For a static model, sets default_dim_value size (commonly batch size). For a "
"dynamic batch model, sets the batch "
"size at runtime."));
ap(is_nhwc, {"--nhwc"}, ap.help("Treat tensorflow format as nhwc"), ap.set_value(true));
ap(skip_unknown_operators,
{"--skip-unknown-operators"},
......@@ -96,7 +103,16 @@ struct loader
ap.help("Dim of a parameter (format: \"@name d1 d2 dn\")"),
ap.append(),
ap.nargs(2));
ap(dyn_param_dims,
{"--dyn-input-dim"},
ap.help("Dynamic dimensions of a parameter (format: \"@name_1\" \"[{min:x, max:y, "
"optimals:[o1,o2,...]}, dim2,dim3, ...]\", \"@name_2\", ... You can supply a "
"single integer value for a dimension to specify it as fixed."),
ap.append(),
ap.nargs(2));
ap(default_dyn_dim,
{"--default-dyn-dim"},
ap.help("Default dynamic dimension (format: \"{min:x, max:y, optimals:[o1,o2]}\")."));
ap(output_names,
{"--output-names"},
ap.help("Names of node output (format: \"name_1 name_2 name_n\")"),
......@@ -147,6 +163,40 @@ struct loader
return map_input_dims;
}
static auto parse_dyn_dims_json(const std::string& dd_json)
{
// expecting a json string like "[{min:1,max:64,optimals:[1,2,4,8]},3,224,224]"
auto v = from_json_string(convert_to_json(dd_json));
std::vector<migraphx::shape::dynamic_dimension> dyn_dims;
std::transform(v.begin(), v.end(), std::back_inserter(dyn_dims), [&](auto x) {
if(x.is_object())
return from_value<migraphx::shape::dynamic_dimension>(x);
auto d = x.template to<std::size_t>();
return migraphx::shape::dynamic_dimension{d, d};
});
return dyn_dims;
}
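// e.g. "[{min:1,max:64,optimals:[1,2,4,8]},3,224,224]" parses to
// { {1,64,{1,2,4,8}}, {3,3}, {224,224}, {224,224} }.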
static auto parse_dyn_dims_map(const std::vector<std::string>& param_dyn_dims)
{
// expecting vector of strings formatted like
// {"@param_name_0", "dd_json_0", "@param_name_1", "dd_json_1", ...}
std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
std::string name = "";
for(auto&& x : param_dyn_dims)
{
if(x[0] == '@')
{
name = x.substr(1);
}
else
{
map_dyn_input_dims[name] = parse_dyn_dims_json(x);
}
}
return map_dyn_input_dims;
}
static auto parse_output_names(const std::vector<std::string>& output_names_info)
{
std::vector<std::string> output_node_names;
......@@ -158,13 +208,44 @@ struct loader
return output_node_names;
}
tf_options get_tf_options() const
{
auto map_input_dims = parse_param_dims(param_dims);
auto output_node_names = parse_output_names(output_names);
tf_options options;
options.is_nhwc = is_nhwc;
options.batch_size = batch;
options.map_input_dims = map_input_dims;
options.output_node_names = output_node_names;
return options;
}
onnx_options get_onnx_options() const
{
auto map_input_dims = parse_param_dims(param_dims);
auto map_dyn_input_dims = parse_dyn_dims_map(dyn_param_dims);
onnx_options options;
if(default_dyn_dim.empty())
{
options.default_dim_value = batch;
}
else
{
auto v = from_json_string(convert_to_json(default_dyn_dim));
options.default_dyn_dim_value = from_value<migraphx::shape::dynamic_dimension>(v);
}
options.skip_unknown_operators = skip_unknown_operators;
options.print_program_on_error = true;
options.map_input_dims = map_input_dims;
options.map_dyn_input_dims = map_dyn_input_dims;
return options;
}
program load()
{
program p;
if(model.empty())
{
auto map_input_dims = parse_param_dims(param_dims);
auto output_node_names = parse_output_names(output_names);
if(file_type.empty())
{
if(ends_with(file, ".onnx"))
......@@ -179,16 +260,11 @@ struct loader
std::cout << "Reading: " << file << std::endl;
if(file_type == "onnx")
{
onnx_options options;
options.default_dim_value = batch;
options.skip_unknown_operators = skip_unknown_operators;
options.print_program_on_error = true;
options.map_input_dims = map_input_dims;
p = parse_onnx(file, options);
p = parse_onnx(file, get_onnx_options());
}
else if(file_type == "tf")
{
p = parse_tf(file, tf_options{is_nhwc, batch, map_input_dims, output_node_names});
p = parse_tf(file, get_tf_options());
}
else if(file_type == "json")
{
......@@ -289,14 +365,21 @@ struct program_params
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append(), ap.nargs(2));
}
auto generate(const program& p, const target& t, bool offload)
auto generate(const program& p, const target& t, bool offload, unsigned batch)
{
parameter_map m;
auto param_shapes = p.get_parameter_shapes();
std::unordered_map<std::string, shape> static_param_shapes;
std::transform(
param_shapes.cbegin(),
param_shapes.cend(),
std::inserter(static_param_shapes, static_param_shapes.end()),
[&](const auto& x) { return std::make_pair(x.first, x.second.to_static(batch)); });
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
m[s] = fill_argument(static_param_shapes.at(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, t, offload);
m[s] = fill_argument(static_param_shapes.at(s), 1);
fill_param_map(m, static_param_shapes, t, offload);
return m;
}
};
......@@ -305,12 +388,12 @@ struct compiler_target
{
#ifdef HAVE_GPU
std::string target_name = "gpu";
#elif HAVE_CPU
#elif defined(HAVE_CPU)
std::string target_name = "cpu";
#elif HAVE_FPGA
std::string target_name = "fpga"
#elif defined(HAVE_FPGA)
std::string target_name = "fpga";
#else
std::string target_name = "ref"
std::string target_name = "ref";
#endif
void parse(argument_parser& ap)
......@@ -332,7 +415,8 @@ struct compiler
program_params parameters;
compiler_target ct;
compile_options co;
precision quantize = precision::fp32;
bool to_fp16 = false;
bool to_int8 = false;
std::vector<std::string> fill0;
std::vector<std::string> fill1;
......@@ -353,29 +437,55 @@ struct compiler
{"--exhaustive-tune"},
ap.help("Exhastively search for best tuning parameters for kernels"),
ap.set_value(true));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(precision::int8));
ap(to_fp16, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(true));
ap(to_int8, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(true));
}
auto params(const program& p)
{
return parameters.generate(p, ct.get_target(), co.offload_copy);
return parameters.generate(p, ct.get_target(), co.offload_copy, l.batch);
}
auto host_params(const program& p)
{
return parameters.generate(p, ct.get_target(), true, l.batch);
}
program compile()
{
auto p = l.load();
// Don't compile if it's already been compiled
if(p.is_compiled())
{
if(ct.target_name == "gpu")
{
if(is_offload_copy_set(p) and not co.offload_copy)
{
std::cout << "MIGraphX program was likely compiled with offload_copy set, Try "
"passing "
"`--enable-offload-copy` if program run fails.\n";
}
else if(co.offload_copy)
{
std::cout << "MIGraphX program was likely compiled without "
"offload_copy set, Try "
"removing "
"`--enable-offload-copy` flag if passed to driver, if program run "
"fails.\n";
}
}
return p;
}
auto t = ct.get_target();
if(quantize == precision::fp16)
if(to_fp16)
{
quantize_fp16(p);
}
else if(quantize == precision::int8)
if(to_int8)
{
quantize_int8(p, t, {params(p)});
quantize_int8(p, t, {host_params(p)});
}
p.compile(t, co);
l.save(p);
......@@ -432,19 +542,25 @@ struct verify : command<verify>
std::cout << p << std::endl;
auto t = c.ct.get_target();
auto m = c.parameters.generate(p, t, true);
auto m = c.parameters.generate(p, t, true, c.l.batch);
auto quantize = precision::fp32;
if(c.to_fp16)
quantize = precision::fp16;
if(c.to_int8)
quantize = precision::int8;
if(per_instruction)
{
verify_instructions(p, t, c.co, c.quantize, tolerance);
verify_instructions(p, t, c.co, quantize, tolerance);
}
else if(reduce)
{
verify_reduced_program(p, t, c.co, c.quantize, m, tolerance);
verify_reduced_program(p, t, c.co, quantize, m, tolerance);
}
else
{
verify_program(c.l.file, p, t, c.co, c.quantize, m, tolerance);
verify_program(c.l.file, p, t, c.co, quantize, m, tolerance);
}
}
};
......@@ -574,6 +690,26 @@ struct onnx : command<onnx>
}
};
struct tf : command<tf>
{
bool show_ops = false;
void parse(argument_parser& ap)
{
ap(show_ops,
{"--list", "-l"},
ap.help("List all tf operators supported by MIGraphX"),
ap.set_value(true));
}
void run() const
{
if(show_ops)
{
for(const auto& name : get_tf_operators())
std::cout << name << std::endl;
}
}
};
struct main_command
{
static std::string get_command_help(const std::string& title = colorize(color::fg_yellow,
......
......@@ -24,6 +24,8 @@
#include "perf.hpp"
#include <migraphx/generate.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/register_target.hpp>
#ifdef HAVE_GPU
#include <migraphx/gpu/hip.hpp>
......@@ -39,36 +41,25 @@ auto get_hash(const T& x)
return std::hash<T>{}(x);
}
parameter_map fill_param_map(parameter_map& m, const program& p, const target& t, bool offload)
parameter_map fill_param_map(parameter_map& m,
const std::unordered_map<std::string, shape>& param_shapes,
const target& t,
bool offload)
{
for(auto&& x : p.get_parameter_shapes())
for(auto&& x : param_shapes)
{
argument& arg = m[x.first];
if(arg.empty())
{
assert(not x.second.dynamic());
arg = generate_argument(x.second, get_hash(x.first));
}
if(not offload)
arg = t.copy_to(arg);
}
return m;
}
parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu)
{
for(auto&& x : p.get_parameter_shapes())
{
argument& arg = m[x.first];
if(arg.empty())
arg = generate_argument(x.second, get_hash(x.first));
#ifdef HAVE_GPU
if(gpu)
arg = gpu::to_gpu(arg);
#else
(void)gpu;
#endif
}
return m;
}
parameter_map create_param_map(const program& p, const target& t, bool offload)
{
parameter_map m;
......@@ -108,6 +99,38 @@ target get_target(bool gpu)
return make_target("cpu");
}
bool is_offload_copy_set(const program& p)
{
assert(p.is_compiled());
const module* mm = p.get_main_module();
std::vector<std::string> param_names = mm->get_parameter_names();
std::unordered_set<instruction_ref> param_ins;
std::transform(param_names.begin(),
param_names.end(),
std::inserter(param_ins, param_ins.begin()),
[&](const auto& i) { return mm->get_parameter(i); });
for(const auto& i : *mm)
{
if(i.name() == "hip::copy_to_gpu")
{
auto copy_arg = instruction::get_output_alias(i.inputs().front(), true);
param_ins.erase(copy_arg);
}
else if(i.name() == "@return")
{
auto return_args = i.inputs();
for(const auto& j : return_args)
{
auto alias_ins = instruction::get_output_alias(j, true);
if((alias_ins->name() == "@param" && param_ins.erase(alias_ins) == 0) or
(alias_ins->name() != "hip::copy_from_gpu"))
return false;
}
}
}
return param_ins.empty();
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
} // namespace migraphx
......@@ -30,13 +30,24 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
parameter_map
fill_param_map(parameter_map& m, const program& p, const target& t, bool offload = false);
parameter_map fill_param_map(parameter_map& m,
const std::unordered_map<std::string, shape>& param_shapes,
const target& t,
bool offload = false);
parameter_map create_param_map(const program& p, const target& t, bool offload = false);
parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu);
parameter_map create_param_map(const program& p, bool gpu = true);
target get_target(bool gpu);
/**
 * @brief Checks whether a MIGraphX program compiled for the GPU was compiled with
 * offload_copy set. This is intended to print a hint for users and may not classify
 * every compiled program correctly.
 * @param p Compiled MIGraphX program for the GPU backend
 * @return true if the program is classified as compiled with offload_copy set
*/
bool is_offload_copy_set(const program& p);
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
......
......@@ -71,6 +71,16 @@ struct dynamic_loader_impl
std::shared_ptr<tmp_dir> temp = nullptr;
};
fs::path dynamic_loader::path(void* address)
{
fs::path p;
Dl_info info;
// Find the location of .so
if(dladdr(address, &info) != 0)
p = info.dli_fname;
return p;
}
dynamic_loader::dynamic_loader(const fs::path& p) : impl(std::make_shared<dynamic_loader_impl>(p))
{
}
......
......@@ -31,6 +31,8 @@
#include <migraphx/ranges.hpp>
#include <iterator>
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -67,13 +69,13 @@ static void create_pointwise_modules(module_pass_manager& mpm)
continue;
if(ins->get_operator().name() == "layout")
continue;
assert(ins->get_operator().attributes().contains("point_op"));
auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++));
pm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> param_map;
std::vector<instruction_ref> pointwise_inputs;
std::size_t i = 0;
for(auto input : ins->inputs())
{
if(contains(param_map, input))
......@@ -92,6 +94,10 @@ static void create_pointwise_modules(module_pass_manager& mpm)
}
}
// Don't create pointwise module if no inputs are detected
if(pointwise_inputs.empty())
continue;
std::vector<instruction_ref> inputs;
std::transform(ins->inputs().begin(),
ins->inputs().end(),
......@@ -188,6 +194,10 @@ void fuse_pointwise::apply(module_pass_manager& mpm) const
{
create_pointwise_modules(mpm);
mpm.run_pass(dead_code_elimination{});
if(enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}))
{
return;
}
for(int i = 0; i < 8; i++)
{
if(not find_pointwise_modules(mpm.get_module()))
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/register_op.hpp>
#include <iterator>
#include <map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct fused_reduce
{
std::vector<std::int64_t> axes{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.axes, "axes"));
}
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
if(mods.size() != 1)
MIGRAPHX_THROW("should have one submodule.");
auto* sm = mods.front();
if(sm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("Only one output supported");
auto names = sm->get_parameter_names();
check_shapes{inputs, *this}.has(names.size()).same_ndims();
std::sort(names.begin(), names.end());
auto shapes = sm->get_parameter_shapes();
// Check dimension matches for each input
if(not equal(names, inputs, [&](const auto& name, const auto& input) {
return shapes.at(name).lens() == input.lens();
}))
MIGRAPHX_THROW("Dimenstion does not match the submodule.");
const auto& s = inputs.at(0);
auto lens = s.lens();
if(lens != sm->get_output_shapes().front().lens())
{
for(const auto& axis : axes)
{
lens[axis] = 1;
}
}
return shape::from_permutation(
sm->get_output_shapes().front().type(), lens, find_permutation(inputs));
}
std::string name() const { return "fused_reduce"; }
};
MIGRAPHX_REGISTER_OP(fused_reduce);
static std::unordered_map<instruction_ref, instruction_ref>
get_ins_param_map(const std::vector<instruction_ref>& inputs, const_module_ref sm)
{
std::unordered_map<instruction_ref, instruction_ref> result;
auto names = sm->get_parameter_names();
std::sort(names.begin(), names.end());
assert(names.size() == inputs.size());
std::transform(names.begin(),
names.end(),
inputs.begin(),
std::inserter(result, result.end()),
[&](const auto& name, auto input) {
return std::make_pair(input, sm->get_parameter(name));
});
return result;
}
static void insert_params(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
auto n = sm->get_parameter_shapes().size();
for(auto input : ins->inputs())
{
if(contains(map_ins, input))
continue;
auto s = shape{input->get_shape().type(), input->get_shape().lens()};
map_ins[input] = sm->add_parameter("x" + std::to_string(n++), s);
}
}
static auto insert_ins_in_submodule(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
insert_params(sm, ins, map_ins);
return sm->add_instructions({ins}, map_ins);
}
static auto insert_ins_in_submodule(module_ref sm, instruction_ref ins)
{
std::unordered_map<instruction_ref, instruction_ref> map_ins;
return insert_ins_in_submodule(sm, ins, map_ins);
}
static auto
insert_module_in_submodule(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
insert_params(sm, ins, map_ins);
auto* m = ins->module_inputs().front();
auto param_map = get_ins_param_map(ins->inputs(), m);
for(auto&& [input, param] : param_map)
{
map_ins[param] = map_ins.at(input);
}
return sm->add_instructions(m, map_ins);
}
static std::vector<instruction_ref>
find_inputs(module_ref sm,
const module& parent,
const std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
std::vector<instruction_ref> result;
std::map<std::string, instruction_ref> names;
for(auto&& [input, param] : map_ins)
{
if(not sm->has_instruction(param))
continue;
if(param->name() != "@param")
continue;
if(not parent.has_instruction(input))
continue;
auto v = param->get_operator().to_value();
auto name = v.at("parameter").to<std::string>();
names[name] = input;
}
std::transform(names.begin(), names.end(), std::back_inserter(result), [](const auto& p) {
return p.second;
});
assert(result.size() == sm->get_parameter_shapes().size());
return result;
}
static void create_reduce_modules(module_pass_manager& mpm)
{
std::size_t n = 0;
for(auto ins : iterator_for(mpm.get_module()))
{
if(not ins->get_operator().attributes().get("reduce", false))
continue;
if(ins->inputs().size() != 1)
continue;
auto* rm =
mpm.create_module(mpm.get_module().name() + ":" + ins->name() + std::to_string(n++));
rm->set_bypass();
rm->add_return(insert_ins_in_submodule(rm, ins));
auto v = ins->get_operator().to_value();
mpm.get_module().replace_instruction(
ins, make_op("fused_reduce", {{"axes", v["axes"]}}), ins->inputs(), {rm});
}
}
template <class... Ms>
static auto match_broadcast(Ms... ms)
{
return match::skip(match::name("contiguous"))(
match::name("multibroadcast")(match::arg(0)(ms...), match::used_once()).bind("broadcast"));
}
template <class... Ms>
static auto any_input(Ms... ms)
{
return match::any_of[match::inputs()](match::any(ms...).bind("input"));
}
static auto match_broadcastable_input(const std::string& op, const std::string& name)
{
auto match_op = match::name(op)(match::used_once()).bind(name);
auto match_op_input = any_input(match_op, match::used_once());
auto broadcast_match_op_input = any_input(match_broadcast(match_op), match::used_once());
return match::any_of(match_op_input, broadcast_match_op_input);
}
namespace {
struct find_pointwise_reduce
{
auto matcher() const
{
return match::name("fused_reduce")(match_broadcastable_input("pointwise", "pointwise"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto reduce = r.result;
auto input = r.instructions["pointwise"];
const auto* pm = input->module_inputs().front();
const auto* old_rm = reduce->module_inputs().front();
auto* rm = mpm.create_module(pm->name() + ":" + old_rm->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Insert pointwise
auto rins = insert_ins_in_submodule(rm, input, map_ins).front();
map_ins[input] = rins;
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast] = insert_ins_in_submodule(rm, broadcast, map_ins).front();
}
// Insert fused_reduce
rm->add_return(insert_module_in_submodule(rm, reduce, map_ins));
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(reduce, reduce->get_operator(), new_inputs, {rm});
}
};
struct find_reduce_pointwise
{
auto matcher() const
{
return match::name("pointwise")(match_broadcastable_input("fused_reduce", "reduce"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto pw = r.result;
auto reduce = r.instructions["reduce"];
auto input = r.instructions["input"];
const auto* pm = pw->module_inputs().front();
const auto* old_rm = reduce->module_inputs().front();
auto* rm = mpm.create_module(old_rm->name() + ":" + pm->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Copy module instructions
insert_module_in_submodule(rm, reduce, map_ins);
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast->inputs().front()] = rm->get_returns().front();
auto bout = insert_ins_in_submodule(rm, broadcast, map_ins);
map_ins[input] = bout.front();
}
else
{
map_ins[input] = rm->get_returns().front();
}
auto out = insert_ins_in_submodule(rm, pw, map_ins);
rm->replace_return(out);
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(pw, reduce->get_operator(), new_inputs, {rm});
}
};
struct find_reduce_reduce
{
auto matcher() const
{
return match::name("fused_reduce")(match_broadcastable_input("fused_reduce", "reduce"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto reduce1 = r.result;
auto reduce2 = r.instructions["reduce"];
auto input = r.instructions["input"];
if(reduce1->get_operator() != reduce2->get_operator())
return;
const auto* rm1 = reduce1->module_inputs().front();
const auto* rm2 = reduce2->module_inputs().front();
auto* rm = mpm.create_module(rm1->name() + ":" + rm2->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Copy reduce1 instructions
insert_module_in_submodule(rm, reduce2, map_ins);
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast->inputs().front()] = rm->get_returns().front();
auto bout = insert_ins_in_submodule(rm, broadcast, map_ins);
map_ins[input] = bout.front();
}
else
{
map_ins[input] = rm->get_returns().front();
}
auto out = insert_module_in_submodule(rm, reduce1, map_ins);
rm->replace_return(out);
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(reduce1, reduce1->get_operator(), new_inputs, {rm});
}
};
} // namespace
void fuse_reduce::apply(module_pass_manager& mpm) const
{
create_reduce_modules(mpm);
mpm.run_pass(dead_code_elimination{});
for(int i = 0; i < 4; i++)
{
match::find_matches(
mpm, find_reduce_pointwise{}, find_pointwise_reduce{}, find_reduce_reduce{});
mpm.run_pass(dead_code_elimination{});
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -42,7 +42,7 @@ void any_cast()
template <class T>
struct auto_any_caster
{
T& x;
T& x; // NOLINT
template <class U>
operator U&()
......
......@@ -27,6 +27,8 @@
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -36,7 +38,27 @@ struct check_context
{
struct op : auto_register_op<op>
{
std::string name() const { return "check_context::" + get_type_name<T>(); }
static std::string compute_op_name()
{
const auto& op_type_name = get_type_name<T>();
const auto& split_name = split_string(op_type_name, ':');
std::vector<std::string> name_without_version = {"check_context"};
// op_type_name would contain internal namespace name with version_x_y_z
// remove version and construct op_name such as check_context::migraphx::gpu::context
std::copy_if(
split_name.begin(),
split_name.end(),
std::back_inserter(name_without_version),
[&](const auto& i) { return not i.empty() and not contains(i, "version"); });
return join_strings(name_without_version, "::");
}
std::string name() const
{
static auto op_name = compute_op_name();
return op_name;
}
shape compute_shape(const std::vector<shape>&) const { return {}; }
argument compute(context& ctx, const shape&, const std::vector<argument>&) const
{
......
......@@ -38,8 +38,8 @@ struct check_shapes
{
const shape* begin;
const shape* end;
const std::string name;
const bool dynamic_allowed;
std::string name;
bool dynamic_allowed;
check_shapes(const shape* b, const shape* e, const std::string& n, const bool d = false)
: begin(b), end(e), name(n), dynamic_allowed(d)
......
......@@ -34,6 +34,26 @@ inline namespace MIGRAPHX_INLINE_NS {
struct module;
struct operation;
/**
* Broadcasting works by comparing the shapes element-wise starting with
* the trailing (right-most) dimensions and working leftwards. This is equivalent
* to what is done in NumPy.
* example 1:
* s0 = (3,2,4,5) and s1 = (2,1,1)
* In this case we need to broadcast (:,1,1) portion of
* s1 plus broadcast the 1st dimension of s0
* giving output_lens = (3,2,4,5)
*
* example 2:
* s0 = (3,2,1,5) and s1 = (2,7,5)
* In this case we need to broadcast the (:,:,1:,:) axis
* of s0 plus the 1st dimension of s1 giving
* output_lens = (3,2,7,5)
*
* example 3:
* s0 = (4, 1, 1) and s1 = (3, 4)
* output_lens = (4, 3, 4)
*/
std::vector<std::size_t> compute_broadcasted_lens(std::vector<std::size_t> s0,
std::vector<std::size_t> s1);
......@@ -41,10 +61,41 @@ std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, sha
shape common_shape(const std::vector<shape>& shapes);
/**
* @brief Compute the common (broadcasted) dimensions of a list of fixed shapes
*/
std::vector<std::size_t> compute_common_lens(const std::vector<shape>& shapes);
/**
 * @brief Compute the common (broadcasted) dynamic dimensions of a list of dynamic shapes
*/
std::vector<shape::dynamic_dimension> compute_common_dyn_dims(const std::vector<shape>& shapes);
/**
* @brief Creates and adds instructions to convert input arguments to common shapes and types
* by adding multi-broadcast and type convert operations. This is a utility function for creating
* operations where the shape and type of inputs need to match. It supports both dynamic and
* static-shaped arguments.
*
* @param m containing module for instruction
* @param ins insertion location in instruction list
* @param inputs instructions to use as argument list; also, the shapes
* attached to each instruction_ref are considered for broadcasting
* @return std::vector<instruction_ref> a modified argument list
*/
std::vector<instruction_ref>
insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref> inputs);
std::vector<instruction_ref> add_common_args(module& m, std::vector<instruction_ref> inputs);
instruction_ref insert_common_op(module& m,
instruction_ref ins,
const operation& op,
std::vector<instruction_ref> inputs);
/**
* @brief Wrapper for insert_common_args() which inserts operation at the end of the module.
*/
instruction_ref add_common_op(module& m, const operation& op, std::vector<instruction_ref> inputs);
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -32,7 +32,12 @@ inline namespace MIGRAPHX_INLINE_NS {
struct compile_options
{
bool offload_copy = false;
/**
 * Have MIGraphX allocate memory for parameters and add instructions
 * to copy parameters and outputs to/from an offload device such as a GPU.
*/
bool offload_copy = false;
bool fast_math = true;
bool exhaustive_tune = false;
tracer trace{};
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
#include <migraphx/config.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/tensor_view.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
template <class Output, class T, class Padding, class Stride>
void convolution(Output output, T input, T weights, Padding padding, Stride stride, int group)
{
auto output_shape = output.get_shape();
auto in_lens = input.get_shape().lens();
auto wei_lens = weights.get_shape().lens();
auto wei_n = wei_lens[0];
auto wei_c = wei_lens[1];
std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto w = idx_o[1];
auto n_dim = idx_o.size();
std::vector<std::ptrdiff_t> win_start;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
win_start.push_back(std::ptrdiff_t(idx_o[dim] * stride[d_2]) -
std::ptrdiff_t(padding[d_2]));
}
const auto group_id = w / (wei_n / group);
shape win_shape{output_shape.type(), win_size};
double acc = 0.0;
shape_for_each(win_shape, [&](auto idx_win) {
auto k = idx_win[0];
const auto in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
idx[1] = in_ch;
std::transform(idx_win.begin() + 1,
idx_win.end(),
win_start.begin(),
idx.begin() + 2,
[](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
idx_wei[0] = w;
std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx.begin(),
idx.end(),
in_lens.begin(),
in_lens.end(),
std::less<std::ptrdiff_t>{}))
{
acc += input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
}
});
output[i] = acc;
});
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -77,6 +77,8 @@ struct cpp_generator
function& set_types(const module& m);
function& set_types(const module& m, const std::function<std::string(shape)>& parse);
function& set_generic_types(const module& m);
function& add_generic_param(const std::string& pname);
function& unused_param(const std::string& pname);
};
cpp_generator();
......@@ -105,6 +107,10 @@ struct cpp_generator
std::string create_function(const function& f);
static std::vector<std::string>
to_args(const std::vector<instruction_ref>& inputs,
const std::unordered_map<instruction_ref, std::string>& names);
private:
std::unique_ptr<cpp_generator_impl> impl;
};
......
......@@ -37,6 +37,12 @@ struct dynamic_loader_impl;
struct dynamic_loader
{
template <class T>
static fs::path path(T* address)
{
return path(reinterpret_cast<void*>(address));
}
static fs::path path(void* address);
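// Sketch: dynamic_loader::path(&some_symbol) returns the path of the shared
// object containing that symbol (empty path if dladdr cannot resolve it).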
dynamic_loader() = default;
dynamic_loader(const fs::path& p);
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_MIGRAPHX_FUSE_REDUCE_HPP
#define MIGRAPHX_GUARD_MIGRAPHX_FUSE_REDUCE_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module_pass_manager;
struct fuse_reduce
{
std::string name() const { return "fuse_reduce"; }
void apply(module_pass_manager& mpm) const;
};
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_MIGRAPHX_FUSE_REDUCE_HPP