gaoqiong / MIGraphX / Commits / fe493c28

Commit fe493c28, authored Apr 10, 2023 by Alan Turner

    Merge remote-tracking branch 'origin/develop' into ck-gsg

Parents: ba0b3794, cce35871
Changes: 121 files in the full commit; this page shows 20 changed files with 247 additions and 79 deletions (+247 / -79).
.dockerignore                                               +9   -0
.github/workflows/history.yaml                              +5   -0
.github/workflows/performance.yaml                          +5   -0
Dockerfile                                                  +1   -1
doc/src/driver/compile.rst                                  +4   -0
doc/src/driver/read.rst                                     +9   -1
examples/migraphx/cpp_parse_load_save/CMakeLists.txt        +1   -1
examples/migraphx/custom_op_miopen_kernel/CMakeLists.txt    +1   -1
examples/migraphx/custom_op_rocblas_kernel/CMakeLists.txt   +1   -1
examples/migraphx/migraphx_driver/README.md                 +13  -9
examples/vision/cpp_mnist/CMakeLists.txt                    +1   -1
rbuild.ini                                                  +2   -0
src/CMakeLists.txt                                          +3   -0
src/common.cpp                                              +18  -7
src/cpp_generator.cpp                                       +28  -8
src/driver/argument_parser.hpp                              +15  -7
src/driver/main.cpp                                         +109 -21
src/driver/perf.cpp                                         +8   -19
src/driver/perf.hpp                                         +4   -2
src/dynamic_loader.cpp                                      +10  -0
.dockerignore (new file, 0 → 100644)

+# Ignore everything
+**
+# Allow files and directories
+!*.txt
+!*.ini
+!/tools/*.sh
+!/doc/*.txt
+!/test/onnx/.onnxrt-commit
.github/workflows/history.yaml

@@ -20,6 +20,10 @@ on:
         description: Repository where benchmark utils are stored
         required: true
         default: "ROCmSoftwarePlatform/migraphx-benchmark-utils"
+      organization:
+        description: Organization based on which location of files will be different
+        required: true
+        default: "AMD"

 jobs:
   release:
@@ -29,6 +33,7 @@ jobs:
       end_date: ${{ github.event.inputs.end_date || 'yyyy-mm-dd' }}
       history_repo: ${{ github.event.inputs.history_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
       benchmark_utils_repo: ${{ github.event.inputs.benchmark_utils_repo || 'ROCmSoftwarePlatform/migraphx-benchmark-utils' }}
+      organization: ${{ github.event.inputs.organization || 'AMD' }}
     secrets:
       gh_token: ${{ secrets.MIGRAPHX_BOT_TOKEN }}
       mail_user: ${{ secrets.MAIL_USERNAME }}
.github/workflows/performance.yaml

@@ -29,6 +29,10 @@ on:
         description: Last N results
         required: true
         default: '10'
+      model_timeout:
+        description: If model in performance test script passes this threshold, it will be skipped
+        required: true
+        default: '30m'
       flags:
         description: -m for Max value; -s for Std dev; -r for Threshold file
         required: true
@@ -46,6 +50,7 @@ jobs:
       performance_reports_repo: ${{ github.event.inputs.performance_reports_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
       benchmark_utils_repo: ${{ github.event.inputs.benchmark_utils_repo || 'ROCmSoftwarePlatform/migraphx-benchmark-utils' }}
       organization: ${{ github.event.inputs.organization || 'AMD' }}
+      model_timeout: ${{ github.event.inputs.model_timeout || '30m' }}
     secrets:
       gh_token: ${{ secrets.MIGRAPHX_BOT_TOKEN }}
       mail_user: ${{ secrets.MAIL_USERNAME }}
Dockerfile

@@ -110,7 +110,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
-RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@78b706fe9879587ab98b6614ae539265374a3fae -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@acb727b348086b58a7f261b32c0e4f0686a4c0ee -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
doc/src/driver/compile.rst

@@ -32,6 +32,10 @@ Disable fast math optimization
    Perform an exhaustive search to find the fastest version of generated kernels for selected backend

+.. option:: --split-single-dyn-dim
+
+   Enable the split single dynamic dimension pass
+
 .. option:: --fp16

    Quantize for fp16
doc/src/driver/read.rst

@@ -24,7 +24,7 @@ Load as MIGraphX JSON
 .. option:: --batch [unsigned int] (Default: 1)

-   Set batch size for model
+   For a static model, sets the batch size. For a dynamic batch model, sets the batch size at runtime.

 .. option:: --nhwc
@@ -46,6 +46,14 @@ Trim instructions from the end (Default: 0)
    Dim of a parameter (format: "@name d1 d2 dn")

+.. option:: --dyn-input-dim [std::vector<std::string>]
+
+   Set dynamic dimensions of a parameter using JSON formatting (format: "@name" "dynamic_dimension_json")
+
+.. option:: --default-dyn-dim
+
+   Set the default dynamic dimension (format: {min:x, max:y, optimals:[o1,o2,...]})
+
 .. option:: --optimize, -O

    Optimize when reading
examples/migraphx/cpp_parse_load_save/CMakeLists.txt

@@ -27,7 +27,7 @@ project (PLS)
 set (CMAKE_CXX_STANDARD 14)
 set (EXAMPLE parse_load_save)
-list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm)
 find_package (migraphx)
 message ("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
examples/migraphx/custom_op_miopen_kernel/CMakeLists.txt

@@ -27,7 +27,7 @@ project (custom_miopen_kernel)
 set (CMAKE_CXX_STANDARD 14)
 set (EXAMPLE custom_op_miopen_kernel)
-list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm)
 find_package (migraphx REQUIRED)
 find_package (miopen REQUIRED)
examples/migraphx/custom_op_rocblas_kernel/CMakeLists.txt

@@ -28,7 +28,7 @@ set (CMAKE_CXX_STANDARD 14)
 set (EXAMPLE custom_op_rocblas_kernel)
-list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm)
 find_package (migraphx REQUIRED)
 find_package (rocblas REQUIRED)
examples/migraphx/migraphx_driver/README.md

@@ -29,7 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --tf | Load file as a tensorflow graph |
 | --migraphx | Load file as a migraphx graph |
 | --migraphx-json | Load file as a migraphx JSON graph |
-| --batch | Set batch size for the model |
+| --batch | For a static model, set batch size. For a dynamic batch model, sets the batch size at runtime. |
 | --nhwc | Treat tensorflow format as nhwc |
 | --nchw | Treat tensorflow format as nchw |
 | --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |
@@ -44,12 +44,16 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --output \| -o | Output to file |
 | --fill0 | Fill parameter with 0s |
 | --fill1 | Fill parameter with 1s |
+| --input-dim | Set static dimensions of a parameter |
+| --dyn-input-dim | Set dynamic dimensions of a parameter |
+| --default-dyn-dim | Set default dynamic dimension |
 | --gpu | Compile on the gpu |
 | --cpu | Compile on the cpu |
 | --ref | Compile on the reference implementation |
 | --enable-offload-copy | Enable implicit offload copying |
 | --disable-fast-math | Disable fast math optimization |
 | --exhaustive-tune | Enable exhaustive search to find fastest kernel |
+| --split-single-dyn-dim | Enable split_single_dyn_dim compiler pass |
 | --fp16 | Quantize for fp16 |
 | --int8 | Quantize for int8 |
 | --tolerance | Tolerance for errors |
@@ -88,7 +92,7 @@ batch_norm_inference
 broadcast
 capture
 ceil
-check_context::migraphx::version_1::gpu::context
+check_context::migraphx::gpu::context
 clip
 concat
 contiguous
@@ -304,7 +308,7 @@ $ /opt/rocm/bin/migraphx-driver run --onnx simple_graph.onnx
 ```
 Compiling ...
 Reading: simple_graph.onnx
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}
 @1 = hip::hip_allocate_memory[shape=float_type, {256}, {1},id=scratch] -> float_type, {256}, {1}
 @2 = hip::hip_copy_literal[id=@literal:1] -> float_type, {784, 128}, {128, 1}
 x:0 = @param:x:0 -> float_type, {1, 28, 28}, {784, 28, 1}
@@ -327,7 +331,7 @@ x:0 = @param:x:0 -> float_type, {1, 28, 28}, {784, 28, 1}
 @18 = @return(@17)
 Allocating params ...
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}
 @1 = hip::hip_allocate_memory[shape=float_type, {256}, {1},id=scratch] -> float_type, {256}, {1}
 @2 = hip::hip_copy_literal[id=@literal:1] -> float_type, {784, 128}, {128, 1}
 x:0 = @param:x:0 -> float_type, {1, 28, 28}, {784, 28, 1}
@@ -399,7 +403,7 @@ $ /opt/rocm/bin/migraphx-driver compile --gpu --fp16 simple_graph.pb
 ```
 Compiling ...
 Reading: simple_graph.pb
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}
 @1 = hip::hip_allocate_memory[shape=float_type, {456}, {1},id=scratch] -> float_type, {456}, {1}
 @2 = hip::hip_copy_literal[id=@literal:0] -> half_type, {784, 128}, {128, 1}
 @3 = load[offset=256,end=1824](@1) -> half_type, {1, 28, 28}, {784, 28, 1}
@@ -502,7 +506,7 @@ x = @param:x -> float_type, {1, 28, 28}, {784, 28, 1}
 @18 = ref::softmax[axis=1](@17) -> float_type, {1, 10}, {10, 1}
 @19 = ref::identity(@18) -> float_type, {1, 10}, {10, 1}
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}
 @1 = hip::hip_allocate_memory[shape=float_type, {256}, {1},id=scratch] -> float_type, {256}, {1}
 @2 = hip::hip_copy_literal[id=@literal:3] -> float_type, {784, 128}, {128, 1}
 x = @param:x -> float_type, {1, 28, 28}, {784, 28, 1}
@@ -538,7 +542,7 @@ $ /opt/rocm/bin/migraphx-driver perf simple_graph.pb
 ```
 Compiling ...
 Reading: simple_graph.pb
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}
 @1 = hip::hip_allocate_memory[shape=float_type, {256}, {1},id=scratch] -> float_type, {256}, {1}
 @2 = hip::hip_copy_literal[id=@literal:3] -> float_type, {784, 128}, {128, 1}
 @3 = load[offset=0,end=512](@1) -> float_type, {1, 128}, {128, 1}
@@ -561,7 +565,7 @@ output = @param:output -> float_type, {1, 10}, {10, 1}
 Allocating params ...
 Running performance report ...
-@0 = check_context::migraphx::version_1::gpu::context -> float_type, {}, {}: 0.00057782ms, 1%
+@0 = check_context::migraphx::gpu::context -> float_type, {}, {}: 0.00057782ms, 1%
 @1 = hip::hip_allocate_memory[shape=float_type, {256}, {1},id=scratch] -> float_type, {256}, {1}: 0.000295ms, 1%
 @2 = hip::hip_copy_literal[id=@literal:3] -> float_type, {784, 128}, {128, 1}: 0.00027942ms, 1%
 @3 = load[offset=0,end=512](@1) -> float_type, {1, 128}, {128, 1}: 0.000232ms, 1%
@@ -591,7 +595,7 @@ hip::hip_copy_literal: 0.00186824ms, 1%
 load: 0.0016288ms, 1%
 @param: 0.0013428ms, 1%
 broadcast: 0.00118042ms, 1%
-check_context::migraphx::version_1::gpu::context: 0.00057782ms, 1%
+check_context::migraphx::gpu::context: 0.00057782ms, 1%
 reshape: 0.00033842ms, 1%
 hip::hip_allocate_memory: 0.000295ms, 1%
examples/vision/cpp_mnist/CMakeLists.txt

@@ -27,7 +27,7 @@ project (CAI)
 set (CMAKE_CXX_STANDARD 14)
 set (EXAMPLE mnist_inference)
-list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm)
 find_package (migraphx)
 message ("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
rbuild.ini

@@ -14,6 +14,7 @@ define =
     CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
     CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
     MIGRAPHX_ENABLE_CPU=On
+    BUILD_DEV=On

 [develop]
 cxx = ${rocm_path}/llvm/bin/clang++
@@ -25,3 +26,4 @@ define =
     CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
     CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
     MIGRAPHX_ENABLE_CPU=On
+    BUILD_DEV=On
src/CMakeLists.txt

@@ -50,6 +50,7 @@ add_library(migraphx
     env.cpp
     file_buffer.cpp
     fuse_pointwise.cpp
+    fuse_reduce.cpp
    generate.cpp
     inline_module.cpp
     insert_pad.cpp
@@ -73,6 +74,7 @@ add_library(migraphx
     process.cpp
     program.cpp
     propagate_constant.cpp
+    promote_literals.cpp
    quantization.cpp
     quantize_fp16.cpp
     quantize_int8.cpp
@@ -91,6 +93,7 @@ add_library(migraphx
     shape.cpp
     simplify_algebra.cpp
     simplify_reshapes.cpp
+    split_single_dyn_dim.cpp
     tmp_dir.cpp
     value.cpp
     verify_args.cpp
src/common.cpp

@@ -89,8 +89,8 @@ std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, sha
         }
         else if(a == 1 or b == 1)
         {
-            // setting opt to 0, may need to be changed
-            return shape::dynamic_dimension{std::max(a.min, b.min), std::max(a.max, b.max), 0};
+            // setting optimals to empty, may need to be changed
+            return shape::dynamic_dimension{std::max(a.min, b.min), std::max(a.max, b.max)};
         }
         else
         {
@@ -148,10 +148,8 @@ shape common_shape(const std::vector<shape>& shapes)
     return {compute_common_types(shapes), compute_common_lens(shapes)};
 }

-instruction_ref insert_common_op(module& m,
-                                 instruction_ref ins,
-                                 const operation& op,
-                                 std::vector<instruction_ref> inputs)
+std::vector<instruction_ref>
+insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref> inputs)
 {
     if(std::any_of(
            inputs.cbegin(), inputs.cend(), [](auto input) { return input->get_shape().dynamic(); }))
@@ -210,7 +208,20 @@ instruction_ref insert_common_op(module& m,
             return input;
         });
     }
-    return m.insert_instruction(ins, op, inputs);
+    return inputs;
 }

+std::vector<instruction_ref> add_common_args(module& m, std::vector<instruction_ref> inputs)
+{
+    return insert_common_args(m, m.end(), std::move(inputs));
+}
+
+instruction_ref insert_common_op(module& m,
+                                 instruction_ref ins,
+                                 const operation& op,
+                                 std::vector<instruction_ref> inputs)
+{
+    return m.insert_instruction(ins, op, insert_common_args(m, ins, std::move(inputs)));
+}
+
 instruction_ref add_common_op(module& m, const operation& op, std::vector<instruction_ref> inputs)
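The src/common.cpp change splits the old insert_common_op into insert_common_args, which only broadcasts and converts the inputs to a common shape and type, plus thin insert_common_op / add_common_op wrappers on top of it. A minimal sketch of how the two entry points compose; this is illustrative only, not code from the commit, and it assumes the in-tree headers and the literal-construction style used throughout the repository's unit tests:

```cpp
#include <migraphx/program.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/common.hpp>

int main()
{
    using migraphx::shape;
    migraphx::program p;
    auto* mm = p.get_main_module();
    auto x = mm->add_literal(
        migraphx::literal{shape{shape::float_type, {2, 2}}, {1.0f, 2.0f, 3.0f, 4.0f}});
    auto y = mm->add_literal(migraphx::literal{shape{shape::float_type, {1}}, {10.0f}});

    // New helper: get the broadcast/convert instructions without inserting any op yet,
    // so callers can reuse the implicit-broadcast logic on its own.
    auto args = migraphx::add_common_args(*mm, {x, y});

    // Pre-existing helper, now implemented on top of insert_common_args.
    auto sum = migraphx::add_common_op(*mm, migraphx::make_op("add"), {x, y});
    (void)args;
    (void)sum;
    return 0;
}
```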
src/cpp_generator.cpp

@@ -106,6 +106,13 @@ cpp_generator::function& cpp_generator::function::set_generic_types(const module
     return *this;
 }

+cpp_generator::function& cpp_generator::function::add_generic_param(const std::string& pname)
+{
+    params.push_back({pname, "T" + pname});
+    tparams.push_back("class T" + pname);
+    return *this;
+}
+
 struct cpp_generator_impl
 {
     std::stringstream fs{};
@@ -182,7 +189,8 @@ std::string cpp_generator::generate_point_op(const operation& op,
 std::string cpp_generator::str() const { return impl->fs.str(); }

-cpp_generator::function cpp_generator::generate_module(const module& m)
+cpp_generator::function cpp_generator::generate_module(const module& m,
+                                                       const generate_module_callback& g)
 {
     function f;
     auto name = transform_string(m.name(), [](char c) {
@@ -195,13 +203,7 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
         if(ins->name() == "@literal")
             return shape::cpp_type(ins->get_shape().type()) + "(" +
                    ins->get_literal().to_string() + ")";
-        std::vector<std::string> args;
-        std::transform(ins->inputs().begin(),
-                       ins->inputs().end(),
-                       std::back_inserter(args),
-                       [&](auto i) { return names.at(i); });
-        auto s = this->generate_point_op(ins->get_operator(), args);
+        auto s = g(ins, names);
         if(impl->fresult)
             return impl->fresult(ins->get_shape()) + '(' + s + ')';
         else
@@ -210,6 +212,24 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
     return f;
 }

+std::vector<std::string>
+cpp_generator::to_args(const std::vector<instruction_ref>& inputs,
+                       const std::unordered_map<instruction_ref, std::string>& names)
+{
+    std::vector<std::string> args;
+    std::transform(inputs.begin(), inputs.end(), std::back_inserter(args), [&](auto i) {
+        return names.at(i);
+    });
+    return args;
+}
+
+cpp_generator::function cpp_generator::generate_module(const module& m)
+{
+    return this->generate_module(m, [&](auto ins, const auto& names) {
+        return this->generate_point_op(ins->get_operator(), to_args(ins->inputs(), names));
+    });
+}
+
 std::string cpp_generator::create_function(const cpp_generator::function& f)
 {
     impl->function_count++;
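cpp_generator::generate_module is now parameterized by a callback that produces the string for each instruction, and the old single-argument overload forwards to it using generate_point_op plus the new to_args helper. A standalone C++ illustration of that callback-plus-default-overload pattern (plain standard library, not MIGraphX code):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using emit_callback = std::function<std::string(const std::string&)>;

// Callback-driven generator: the caller decides how each op becomes an expression.
std::string generate(const std::vector<std::string>& ops, const emit_callback& g)
{
    std::string out;
    for(const auto& op : ops)
        out += g(op) + ";\n";
    return out;
}

// Default overload keeps the old behaviour, mirroring how the single-argument
// cpp_generator::generate_module(m) now forwards to the callback overload.
std::string generate(const std::vector<std::string>& ops)
{
    return generate(ops, [](const std::string& op) { return "default_" + op + "()"; });
}

int main()
{
    std::cout << generate({"add", "mul"});  // old, hard-coded path
    std::cout << generate({"relu"}, [](const std::string& op) { return "custom_" + op + "()"; });
}
```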
src/driver/argument_parser.hpp

@@ -148,13 +148,21 @@ struct value_parser
     template <MIGRAPHX_REQUIRES(not std::is_enum<T>{} and not is_multi_value<T>{})>
     static T apply(const std::string& x)
     {
-        T result;
-        std::stringstream ss;
-        ss.str(x);
-        ss >> result;
-        if(ss.fail())
-            throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
-        return result;
+        // handle whitespace in string
+        if constexpr(std::is_same<T, std::string>{})
+        {
+            return x;
+        }
+        else
+        {
+            T result;
+            std::stringstream ss;
+            ss.str(x);
+            ss >> result;
+            if(ss.fail())
+                throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
+            return result;
+        }
     }

     template <MIGRAPHX_REQUIRES(std::is_enum<T>{} and not is_multi_value<T>{})>
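This value_parser change matters for the new driver flags: extracting a std::string through operator>> stops at the first whitespace, which would truncate arguments such as the dynamic-dimension JSON specs. A standalone sketch of the same if constexpr special case, using a hypothetical parse_value helper rather than the MIGraphX header (requires C++17):

```cpp
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>

template <class T>
T parse_value(const std::string& x)
{
    if constexpr(std::is_same<T, std::string>{})
    {
        return x; // keep embedded whitespace intact, e.g. "{min:1, max:64, optimals:[1,8]}"
    }
    else
    {
        T result;
        std::stringstream ss(x);
        ss >> result;
        if(ss.fail())
            throw std::runtime_error("Failed to parse '" + x + "'");
        return result;
    }
}

// parse_value<std::string>("{min:1, max:4}") returns the full string unchanged;
// parse_value<int>("42") returns 42.
```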
src/driver/main.cpp

@@ -33,6 +33,7 @@
 #include <migraphx/tf.hpp>
 #include <migraphx/onnx.hpp>
 #include <migraphx/stringutils.hpp>
+#include <migraphx/convert_to_json.hpp>
 #include <migraphx/load_save.hpp>
 #include <migraphx/json.hpp>
 #include <migraphx/version.h>
@@ -68,7 +69,9 @@ struct loader
     bool brief = false;
     std::string output_type;
     std::string output;
+    std::string default_dyn_dim;
     std::vector<std::string> param_dims;
+    std::vector<std::string> dyn_param_dims;
     std::vector<std::string> output_names;

     void parse(argument_parser& ap)
@@ -83,7 +86,11 @@ struct loader
         ap(file_type, {"--tf"}, ap.help("Load as tensorflow"), ap.set_value("tf"));
         ap(file_type, {"--migraphx"}, ap.help("Load as MIGraphX"), ap.set_value("migraphx"));
         ap(file_type, {"--migraphx-json"}, ap.help("Load as MIGraphX JSON"), ap.set_value("json"));
-        ap(batch, {"--batch"}, ap.help("Set batch size for model"));
+        ap(batch,
+           {"--batch"},
+           ap.help("For a static model, sets default_dim_value size (commonly batch size). For a "
+                   "dynamic batch model, sets the batch "
+                   "size at runtime."));
         ap(is_nhwc, {"--nhwc"}, ap.help("Treat tensorflow format as nhwc"), ap.set_value(true));
         ap(skip_unknown_operators,
           {"--skip-unknown-operators"},
@@ -96,7 +103,16 @@ struct loader
            ap.help("Dim of a parameter (format: \"@name d1 d2 dn\")"),
            ap.append(),
            ap.nargs(2));
+        ap(dyn_param_dims,
+           {"--dyn-input-dim"},
+           ap.help("Dynamic dimensions of a parameter (format: \"@name_1\" \"[{min:x, max:y, "
+                   "optimals:[o1,o2,...]}, dim2, dim3, ...]\", \"@name_2\", ... You can supply a "
+                   "single integer value for a dimension to specify it as fixed."),
+           ap.append(),
+           ap.nargs(2));
+        ap(default_dyn_dim,
+           {"--default-dyn-dim"},
+           ap.help("Default dynamic dimension (format: \"{min:x, max:y, optimals:[o1,o2]}\")."));
         ap(output_names,
            {"--output-names"},
            ap.help("Names of node output (format: \"name_1 name_2 name_n\")"),
@@ -147,6 +163,40 @@ struct loader
         return map_input_dims;
     }

+    static auto parse_dyn_dims_json(const std::string& dd_json)
+    {
+        // expecting a json string like "[{min:1,max:64,optimals:[1,2,4,8]},3,224,224]"
+        auto v = from_json_string(convert_to_json(dd_json));
+        std::vector<migraphx::shape::dynamic_dimension> dyn_dims;
+        std::transform(v.begin(), v.end(), std::back_inserter(dyn_dims), [&](auto x) {
+            if(x.is_object())
+                return from_value<migraphx::shape::dynamic_dimension>(x);
+            auto d = x.template to<std::size_t>();
+            return migraphx::shape::dynamic_dimension{d, d};
+        });
+        return dyn_dims;
+    }
+
+    static auto parse_dyn_dims_map(const std::vector<std::string>& param_dyn_dims)
+    {
+        // expecting vector of strings formatted like
+        // {"@param_name_0", "dd_json_0", "@param_name_1", "dd_json_1", ...}
+        std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
+        std::string name = "";
+        for(auto&& x : param_dyn_dims)
+        {
+            if(x[0] == '@')
+            {
+                name = x.substr(1);
+            }
+            else
+            {
+                map_dyn_input_dims[name] = parse_dyn_dims_json(x);
+            }
+        }
+        return map_dyn_input_dims;
+    }
+
     static auto parse_output_names(const std::vector<std::string>& output_names_info)
     {
         std::vector<std::string> output_node_names;
@@ -158,13 +208,44 @@ struct loader
         return output_node_names;
     }

+    tf_options get_tf_options() const
+    {
+        auto map_input_dims    = parse_param_dims(param_dims);
+        auto output_node_names = parse_output_names(output_names);
+        tf_options options;
+        options.is_nhwc           = is_nhwc;
+        options.batch_size        = batch;
+        options.map_input_dims    = map_input_dims;
+        options.output_node_names = output_node_names;
+        return options;
+    }
+
+    onnx_options get_onnx_options() const
+    {
+        auto map_input_dims     = parse_param_dims(param_dims);
+        auto map_dyn_input_dims = parse_dyn_dims_map(dyn_param_dims);
+        onnx_options options;
+        if(default_dyn_dim.empty())
+        {
+            options.default_dim_value = batch;
+        }
+        else
+        {
+            auto v                        = from_json_string(convert_to_json(default_dyn_dim));
+            options.default_dyn_dim_value = from_value<migraphx::shape::dynamic_dimension>(v);
+        }
+        options.skip_unknown_operators = skip_unknown_operators;
+        options.print_program_on_error = true;
+        options.map_input_dims         = map_input_dims;
+        options.map_dyn_input_dims     = map_dyn_input_dims;
+        return options;
+    }
+
     program load()
     {
         program p;
         if(model.empty())
         {
-            auto map_input_dims    = parse_param_dims(param_dims);
-            auto output_node_names = parse_output_names(output_names);
             if(file_type.empty())
             {
                 if(ends_with(file, ".onnx"))
@@ -179,16 +260,11 @@ struct loader
             std::cout << "Reading: " << file << std::endl;
             if(file_type == "onnx")
             {
-                onnx_options options;
-                options.default_dim_value      = batch;
-                options.skip_unknown_operators = skip_unknown_operators;
-                options.print_program_on_error = true;
-                options.map_input_dims         = map_input_dims;
-                p                              = parse_onnx(file, options);
+                p = parse_onnx(file, get_onnx_options());
             }
             else if(file_type == "tf")
             {
-                p = parse_tf(file, tf_options{is_nhwc, batch, map_input_dims, output_node_names});
+                p = parse_tf(file, get_tf_options());
             }
             else if(file_type == "json")
             {
@@ -289,14 +365,21 @@ struct program_params
         ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append(), ap.nargs(2));
     }

-    auto generate(const program& p, const target& t, bool offload)
+    auto generate(const program& p, const target& t, bool offload, unsigned batch)
     {
         parameter_map m;
+        auto param_shapes = p.get_parameter_shapes();
+        std::unordered_map<std::string, shape> static_param_shapes;
+        std::transform(param_shapes.cbegin(),
+                       param_shapes.cend(),
+                       std::inserter(static_param_shapes, static_param_shapes.end()),
+                       [&](const auto& x) { return std::make_pair(x.first, x.second.to_static(batch)); });
         for(auto&& s : fill0)
-            m[s] = fill_argument(p.get_parameter_shape(s), 0);
+            m[s] = fill_argument(static_param_shapes.at(s), 0);
         for(auto&& s : fill1)
-            m[s] = fill_argument(p.get_parameter_shape(s), 1);
+            m[s] = fill_argument(static_param_shapes.at(s), 1);
-        fill_param_map(m, p, t, offload);
+        fill_param_map(m, static_param_shapes, t, offload);
         return m;
     }
 };
@@ -305,12 +388,12 @@ struct compiler_target
 {
 #ifdef HAVE_GPU
     std::string target_name = "gpu";
-#elif HAVE_CPU
+#elif defined(HAVE_CPU)
     std::string target_name = "cpu";
-#elif HAVE_FPGA
-    std::string target_name = "fpga"
+#elif defined(HAVE_FPGA)
+    std::string target_name = "fpga";
 #else
-    std::string target_name = "ref"
+    std::string target_name = "ref";
 #endif

     void parse(argument_parser& ap)
@@ -353,13 +436,18 @@ struct compiler
           {"--exhaustive-tune"},
           ap.help("Exhastively search for best tuning parameters for kernels"),
           ap.set_value(true));
+        ap(co.split_single_dyn_dim,
+           {"--split-single-dyn-dim"},
+           ap.help("If there is a single non-fixed dynamic dimension in the model, then split to "
+                   "static submodules"),
+           ap.set_value(true));
         ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
         ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(precision::int8));
     }

     auto params(const program& p)
     {
-        return parameters.generate(p, ct.get_target(), co.offload_copy);
+        return parameters.generate(p, ct.get_target(), co.offload_copy, l.batch);
     }

     program compile()
@@ -432,7 +520,7 @@ struct verify : command<verify>
         std::cout << p << std::endl;
         auto t = c.ct.get_target();
-        auto m = c.parameters.generate(p, t, true);
+        auto m = c.parameters.generate(p, t, true, c.l.batch);
         if(per_instruction)
         {
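To make the new driver options concrete: --dyn-input-dim takes alternating "@name" / JSON-spec arguments, and loader::parse_dyn_dims_map above pairs them into a per-parameter map before parse_dyn_dims_json converts each spec. A standalone sketch of just that pairing step; the parameter names and dimension values are made up, and the JSON parsing itself is stubbed out here:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
    // Illustrative command line (names/values hypothetical):
    //   migraphx-driver read model.onnx \
    //     --dyn-input-dim @data "[{min:1, max:64, optimals:[1,8]}, 3, 224, 224]"
    std::vector<std::string> args = {"@data", "[{min:1, max:64, optimals:[1,8]}, 3, 224, 224]",
                                     "@mask", "[{min:1, max:64}, 128]"};

    std::unordered_map<std::string, std::string> dyn_dims_json;
    std::string name;
    for(const auto& x : args)
    {
        if(not x.empty() and x.front() == '@')
            name = x.substr(1);      // "@name" selects the parameter
        else
            dyn_dims_json[name] = x; // the JSON spec the driver parses afterwards
    }

    for(const auto& entry : dyn_dims_json)
        std::cout << entry.first << " -> " << entry.second << "\n";
}
```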
src/driver/perf.cpp

@@ -39,36 +39,25 @@ auto get_hash(const T& x)
     return std::hash<T>{}(x);
 }

-parameter_map fill_param_map(parameter_map& m, const program& p, const target& t, bool offload)
+parameter_map fill_param_map(parameter_map& m,
+                             const std::unordered_map<std::string, shape>& param_shapes,
+                             const target& t,
+                             bool offload)
 {
-    for(auto&& x : p.get_parameter_shapes())
+    for(auto&& x : param_shapes)
     {
         argument& arg = m[x.first];
         if(arg.empty())
+        {
+            assert(not x.second.dynamic());
             arg = generate_argument(x.second, get_hash(x.first));
+        }
         if(not offload)
             arg = t.copy_to(arg);
     }
     return m;
 }

-parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu)
-{
-    for(auto&& x : p.get_parameter_shapes())
-    {
-        argument& arg = m[x.first];
-        if(arg.empty())
-            arg = generate_argument(x.second, get_hash(x.first));
-#ifdef HAVE_GPU
-        if(gpu)
-            arg = gpu::to_gpu(arg);
-#else
-        (void)gpu;
-#endif
-    }
-    return m;
-}
-
 parameter_map create_param_map(const program& p, const target& t, bool offload)
 {
     parameter_map m;
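With this change fill_param_map no longer reads shapes off the program; callers resolve any dynamic parameter shapes to static ones first (program_params::generate in src/driver/main.cpp does this with shape::to_static) and pass the resulting map in. A hedged sketch of a caller, assuming the in-tree driver headers and an already-compiled program; this is not code from the commit:

```cpp
#include <string>
#include <unordered_map>
#include <migraphx/program.hpp>
#include <migraphx/target.hpp>
#include "perf.hpp" // driver-internal header declaring fill_param_map

migraphx::parameter_map make_params(const migraphx::program& prog,
                                    const migraphx::target& t,
                                    unsigned batch)
{
    // Resolve dynamic parameter shapes to static ones at the chosen batch size,
    // mirroring program_params::generate in src/driver/main.cpp.
    std::unordered_map<std::string, migraphx::shape> static_shapes;
    for(const auto& ps : prog.get_parameter_shapes())
        static_shapes.emplace(ps.first, ps.second.to_static(batch));

    migraphx::parameter_map m;
    return migraphx::driver::fill_param_map(m, static_shapes, t, /*offload=*/true);
}
```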
src/driver/perf.hpp

@@ -30,8 +30,10 @@ namespace migraphx {
 namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {

 parameter_map
 fill_param_map(parameter_map& m,
-               const program& p, const target& t, bool offload = false);
+               const std::unordered_map<std::string, shape>& param_shapes,
+               const target& t,
+               bool offload = false);

 parameter_map create_param_map(const program& p, const target& t, bool offload = false);

 parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu);
src/dynamic_loader.cpp

@@ -71,6 +71,16 @@ struct dynamic_loader_impl
     std::shared_ptr<tmp_dir> temp = nullptr;
 };

+fs::path dynamic_loader::path(void* address)
+{
+    fs::path p;
+    Dl_info info;
+    // Find the location of .so
+    if(dladdr(address, &info) != 0)
+        p = info.dli_fname;
+    return p;
+}
+
 dynamic_loader::dynamic_loader(const fs::path& p) : impl(std::make_shared<dynamic_loader_impl>(p))
 {
 }
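The new dynamic_loader::path uses dladdr to map an address back to the shared object (or executable) that contains it. A standalone illustration of that technique, not MIGraphX code; on older glibc you may need to link with -ldl:

```cpp
#include <dlfcn.h>
#include <iostream>

static void marker() {}

int main()
{
    Dl_info info{};
    // The address of any symbol in this binary works; dli_fname reports
    // the path of the file that provides it.
    if(dladdr(reinterpret_cast<void*>(&marker), &info) != 0 and info.dli_fname != nullptr)
        std::cout << "loaded from: " << info.dli_fname << "\n";
    return 0;
}
```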