Unverified Commit cc098f4d authored by Umang Yadav, committed by GitHub

Add flag for tuning in migraphx-driver (#1519)

* Add driver flag "--exhaustive-tune" to enable tuning, add support for the same in C/C++ and python API
parent 102c6bdb
@@ -137,7 +137,7 @@ An `argument <migraphx::argument>` can handle memory buffers from either the GPU
 By default when running the `program <migraphx::program>`, buffers are allocated on the corresponding target.
 When compiling for the CPU, the buffers by default will be allocated on the CPU.
 When compiling for the GPU, the buffers by default will be allocated on the GPU.
-With the option ``offloaf_copy=true`` set while compiling for the GPU, the buffers will be located on the CPU.
+With the option ``offload_copy=true`` set while compiling for the GPU, the buffers will be located on the CPU.
 Importing From ONNX
......
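The `offload_copy` behaviour described above is easiest to see from the Python API. A minimal sketch, assuming a model file named `model.onnx`, an available GPU target, and float32 inputs (the file name and dtype are illustrative, not part of this change):

```python
import numpy as np
import migraphx

p = migraphx.parse_onnx("model.onnx")   # hypothetical model path
t = migraphx.get_target("gpu")
p.compile(t, offload_copy=True)         # parameter buffers stay on the host

# With offload_copy=True the inputs can be plain host-side numpy arrays;
# the compiled program copies them to and from device memory internally.
params = {}
for name, shape in p.get_parameter_shapes().items():
    params[name] = migraphx.argument(np.zeros(shape.lens(), dtype=np.float32))
outputs = p.run(params)
```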
@@ -28,6 +28,10 @@ Enable implicit offload copying
 Disable fast math optimization
+.. option:: --exhaustive-tune
+Perform an exhaustive search to find the fastest version of generated kernels for the selected backend
 .. option:: --fp16
 Quantize for fp16
......
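Later hunks in this commit show what the flag actually does on the GPU backend: it is carried through `compile_options` into the GPU `context` via `set_exhaustive_tune_flag`, `find_solution` creates MIOpen find options and calls `miopenSetFindOptionTuning` when tuning is requested, and the convolution find call receives `ctx.get_exhaustive_tune_flag()` where it previously passed `false`. The practical trade-off is a longer compile in exchange for potentially faster kernels.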
@@ -215,13 +215,14 @@ program
 :rtype: list[shape]
-.. py:method:: compile(t, offload_copy=True, fast_math=True)
+.. py:method:: compile(t, offload_copy=True, fast_math=True, exhaustive_tune=False)
 Compiles the program for the target and optimizes it.
 :param target t: This is the target to compile the program for.
 :param bool offload_copy: For targets with offloaded memory (such as the GPU), this will insert instructions during compilation to copy the input parameters to the offloaded memory and to copy the final result from the offloaded memory back to main memory.
 :param bool fast_math: Optimize math functions to use faster approximate versions. There may be slight accuracy degradation when enabled.
+:param bool exhaustive_tune: Flag to enable an exhaustive search to find the fastest version of generated kernels for the selected backend.
 .. py:method:: get_main_module()
......
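A short usage sketch of the extended signature, grounded in the Python binding added later in this commit (the model path is a placeholder):

```python
import migraphx

p = migraphx.parse_onnx("model.onnx")   # placeholder model
t = migraphx.get_target("gpu")

# exhaustive_tune defaults to False; enabling it asks the backend to search
# kernel/solver configurations, so compilation takes longer.
p.compile(t, offload_copy=True, fast_math=True, exhaustive_tune=True)
```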
@@ -11,7 +11,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 ### Commands
 | Command | Description |
-| --- | ---|
+| ------- | -------------------------------------------------------------------------- |
 | op | When followed by the option --list or -l, prints all operators of MIGraphX |
 | params | Prints the input and output parameter shapes |
 | run | Compiles, allocates parameters, evaluates, and prints input graph |
@@ -22,7 +22,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 ### Options
 | Option | Description |
-| --- | --- |
+| ---------------------------------------- | --------------------------------------------------------- |
 | --help \| -h | Show help |
 | --model <resnet50\|inceptionv3\|alexnet> | Loads one of the three default models |
 | --onnx | Load file as onnx graph |
@@ -49,6 +49,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --ref | Compile on the reference implementation |
 | --enable-offload-copy | Enable implicit offload copying |
 | --disable-fast-math | Disable fast math optimization |
+| --exhaustive-tune | Enable exhaustive search to find fastest kernel |
 | --fp16 | Quantize for fp16 |
 | --int8 | Quantize for int8 |
 | --tolerance | Tolerance for errors |
......
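With the new row in place, an illustrative driver invocation would be `migraphx-driver run model.onnx --exhaustive-tune` (the model path is a placeholder and the exact argument layout may differ by version), which compiles and evaluates the graph while exhaustively searching for the fastest kernels.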
@@ -134,6 +134,11 @@ void set_offload_copy(compile_options& options, bool value) { options.offload_co
 void set_fast_math(compile_options& options, bool value) { options.fast_math = value; }
+void set_exhaustive_tune_flag(compile_options& options, bool value)
+{
+options.exhaustive_tune = value;
+}
 void set_file_format(file_options& options, const char* format) { options.format = format; }
 void set_default_dim_value(onnx_options& options, size_t value)
@@ -1690,6 +1695,19 @@ migraphx_compile_options_set_fast_math(migraphx_compile_options_t compile_option
 return api_error_result;
 }
+extern "C" migraphx_status
+migraphx_compile_options_set_exhaustive_tune_flag(migraphx_compile_options_t compile_options,
+bool value)
+{
+auto api_error_result = migraphx::try_([&] {
+if(compile_options == nullptr)
+MIGRAPHX_THROW(migraphx_status_bad_param,
+"Bad parameter compile_options: Null pointer");
+migraphx::set_exhaustive_tune_flag((compile_options->object), (value));
+});
+return api_error_result;
+}
 extern "C" migraphx_status
 migraphx_parse_onnx(migraphx_program_t* out, const char* name, migraphx_onnx_options_t options)
 {
......
@@ -427,6 +427,10 @@ migraphx_compile_options_set_offload_copy(migraphx_compile_options_t compile_opt
 migraphx_status migraphx_compile_options_set_fast_math(migraphx_compile_options_t compile_options,
 bool value);
+migraphx_status
+migraphx_compile_options_set_exhaustive_tune_flag(migraphx_compile_options_t compile_options,
+bool value);
 migraphx_status
 migraphx_parse_onnx(migraphx_program_t* out, const char* name, migraphx_onnx_options_t options);
......
@@ -1015,6 +1015,12 @@ struct compile_options : MIGRAPHX_HANDLE_BASE(compile_options)
 {
 call(&migraphx_compile_options_set_fast_math, this->get_handle_ptr(), value);
 }
+/// Set or un-set exhaustive search to find fastest kernel
+void set_exhaustive_tune_flag(bool value = true)
+{
+call(&migraphx_compile_options_set_exhaustive_tune_flag, this->get_handle_ptr(), value);
+}
 };
 /// A program represents the all computation graphs to be compiled and executed
......
@@ -354,6 +354,9 @@ def compile_options(h):
 h.method('set_fast_math',
 api.params(value='bool'),
 invoke='migraphx::set_fast_math($@)')
+h.method('set_exhaustive_tune_flag',
+api.params(value='bool'),
+invoke='migraphx::set_exhaustive_tune_flag($@)')
 api.add_function('migraphx_parse_onnx',
......
@@ -326,8 +326,7 @@ struct compiler
 loader l;
 program_params parameters;
 compiler_target ct;
-bool offload_copy = false;
-bool fast_math = true;
+compile_options co;
 precision quantize = precision::fp32;
 std::vector<std::string> fill0;
@@ -337,19 +336,26 @@
 l.parse(ap);
 parameters.parse(ap);
 ct.parse(ap);
-ap(offload_copy,
+ap(co.offload_copy,
 {"--enable-offload-copy"},
 ap.help("Enable implicit offload copying"),
 ap.set_value(true));
-ap(fast_math,
+ap(co.fast_math,
 {"--disable-fast-math"},
 ap.help("Disable fast math optimization"),
 ap.set_value(false));
+ap(co.exhaustive_tune,
+{"--exhaustive-tune"},
+ap.help("Exhaustively search for best tuning parameters for kernels"),
+ap.set_value(true));
 ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
 ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(precision::int8));
 }
-auto params(const program& p) { return parameters.generate(p, ct.get_target(), offload_copy); }
+auto params(const program& p)
+{
+return parameters.generate(p, ct.get_target(), co.offload_copy);
+}
 program compile()
 {
@@ -366,10 +372,7 @@
 {
 quantize_int8(p, t, {params(p)});
 }
-compile_options options;
-options.offload_copy = offload_copy;
-options.fast_math = fast_math;
-p.compile(t, options);
+p.compile(t, co);
 l.save(p);
 return p;
 }
@@ -402,60 +405,41 @@ struct params : command<params>
 struct verify : command<verify>
 {
-loader l;
-program_params parameters;
-compiler_target ct;
+compiler c;
 double tolerance = 80;
 bool per_instruction = false;
 bool reduce = false;
-bool offload_copy = false;
-bool fast_math = true;
-precision quantize = precision::fp32;
 void parse(argument_parser& ap)
 {
-l.parse(ap);
-parameters.parse(ap);
-ct.parse(ap);
-ap(offload_copy,
-{"--enable-offload-copy"},
-ap.help("Enable implicit offload copying"),
-ap.set_value(true));
-ap(fast_math,
-{"--disable-fast-math"},
-ap.help("Disable fast math optimization"),
-ap.set_value(false));
+c.parse(ap);
 ap(tolerance, {"--tolerance"}, ap.help("Tolerance for errors"));
 ap(per_instruction,
 {"-i", "--per-instruction"},
 ap.help("Verify each instruction"),
 ap.set_value(true));
 ap(reduce, {"-r", "--reduce"}, ap.help("Reduce program and verify"), ap.set_value(true));
-ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
 }
 void run()
 {
-auto p = l.load();
-l.save(p);
+auto p = c.l.load();
+c.l.save(p);
 std::cout << p << std::endl;
-compile_options options;
-options.offload_copy = offload_copy;
-options.fast_math = fast_math;
-auto t = ct.get_target();
-auto m = parameters.generate(p, t, true);
+auto t = c.ct.get_target();
+auto m = c.parameters.generate(p, t, true);
 if(per_instruction)
 {
-verify_instructions(p, t, options, quantize, tolerance);
+verify_instructions(p, t, c.co, c.quantize, tolerance);
 }
 else if(reduce)
 {
-verify_reduced_program(p, t, options, quantize, m, tolerance);
+verify_reduced_program(p, t, c.co, c.quantize, m, tolerance);
 }
 else
 {
-verify_program(l.file, p, t, options, quantize, m, tolerance);
+verify_program(c.l.file, p, t, c.co, c.quantize, m, tolerance);
 }
 }
 };
......
@@ -108,8 +108,6 @@ target get_target(bool gpu)
 return make_target("cpu");
 }
-void compile_program(program& p, bool gpu) { p.compile(get_target(gpu)); }
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver
 } // namespace migraphx
@@ -37,7 +37,6 @@ parameter_map create_param_map(const program& p, const target& t, bool offload =
 parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu);
 parameter_map create_param_map(const program& p, bool gpu = true);
 target get_target(bool gpu);
-void compile_program(program& p, bool gpu = true);
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver
......
@@ -34,6 +34,7 @@ struct compile_options
 {
 bool offload_copy = false;
 bool fast_math = true;
+bool exhaustive_tune = false;
 tracer trace{};
 };
......
@@ -66,6 +66,7 @@ any_ptr get_queue_context(T&)
 {
 return {};
 }
 template <class T>
 void wait_for_context(T&, any_ptr)
 {
@@ -302,7 +303,7 @@ struct context
 PrivateDetailTypeErasedT value,
 typename std::enable_if<not std::is_reference<PrivateDetailTypeErasedU>::value,
 int>::type* = nullptr) noexcept
-: private_detail_te_value(value)
+: private_detail_te_value(std::move(value))
 {
 }
@@ -412,6 +413,7 @@ inline const ValueType& any_cast(const context& x)
 #endif
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
 inline void migraphx_from_value(const value& v, context& ctx) { ctx.from_value(v); }
 #endif
......
@@ -210,17 +210,15 @@ void program::compile(const target& t, compile_options options)
 assert(not this->is_compiled());
 this->impl->target_name = t.name();
 this->impl->ctx = t.get_context();
 if(enabled(MIGRAPHX_TRACE_COMPILE{}))
 options.trace = tracer{std::cout};
 options.trace(*this);
 options.trace();
 auto&& passes = t.get_passes(this->impl->ctx, options);
 run_passes(*this, passes, options.trace);
 auto mods = this->get_modules();
 // Validate and finalize
 for(const auto& mod : reverse(mods))
 {
......
@@ -329,15 +329,21 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
 .def("is_compiled", &migraphx::program::is_compiled)
 .def(
 "compile",
-[](migraphx::program& p, const migraphx::target& t, bool offload_copy, bool fast_math) {
+[](migraphx::program& p,
+const migraphx::target& t,
+bool offload_copy,
+bool fast_math,
+bool exhaustive_tune) {
 migraphx::compile_options options;
 options.offload_copy = offload_copy;
 options.fast_math = fast_math;
+options.exhaustive_tune = exhaustive_tune;
 p.compile(t, options);
 },
 py::arg("t"),
 py::arg("offload_copy") = true,
-py::arg("fast_math") = true)
+py::arg("fast_math") = true,
+py::arg("exhaustive_tune") = false)
 .def("get_main_module", [](const migraphx::program& p) { return p.get_main_module(); })
 .def(
 "create_module",
......
@@ -216,6 +216,10 @@ struct context
 return *current_device;
 }
+bool get_exhaustive_tune_flag() const { return exhaustive_tune; }
+void set_exhaustive_tune_flag(bool t) { exhaustive_tune = t; }
 hip_device::stream& get_stream() { return get_current_device().get_stream(); }
 hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
@@ -338,6 +342,7 @@ struct context
 // TODO: Make this a vector to support multiple devices
 std::shared_ptr<hip_device> current_device;
 std::vector<shared<hip_event_ptr>> events;
+bool exhaustive_tune = false;
 bool measure_perf = false;
 // for event perf timing
 shared<hip_event_ptr> start_event = nullptr;
......
@@ -175,7 +175,8 @@ struct miopen_convolution
 auto* miopen_stream_handle = ctx.get_stream().get_miopen();
-solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
+solution_ptr = find_solution(
+miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
 auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
 if(status != miopenStatusSuccess)
 MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");
@@ -233,7 +234,7 @@ struct miopen_convolution
 &perf,
 workspace.implicit(),
 workspace_size,
-false);
+ctx.get_exhaustive_tune_flag());
 if(status != miopenStatusSuccess)
 MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
 algo = perf.fwd_algo;
......
@@ -75,11 +75,18 @@ using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestr
 using miopen_problem = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
 using miopen_solution = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);
-inline miopen_solution find_solution(miopenHandle_t handle, miopenProblem_t problem)
+inline miopen_solution
+find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
 {
 miopenSolution_t solution;
 size_t found = 0;
-auto status = miopenFindSolutions(handle, problem, nullptr, &solution, &found, 1);
+miopen_find_options fo = nullptr;
+if(tune)
+{
+fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
+miopenSetFindOptionTuning(fo.get(), 1);
+}
+auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
 auto result = miopen_solution{solution};
 if(status != miopenStatusSuccess or found == 0)
 MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
......
@@ -83,7 +83,6 @@ struct miopen_apply
 auto& ctx = get_context();
 int8_x4_format = get_int8_x4_format(ctx);
 compute_fp32 = get_compute_fp32_flag();
 offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
 add_generic_op("contiguous");
......
@@ -91,6 +91,7 @@ pass enable_pass(bool enabled, pass p)
 std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
 {
 auto& ctx = any_cast<context>(gctx);
+ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
 std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
 unsupported_types.erase(shape::type_t::float_type);
 unsupported_types.erase(shape::type_t::half_type);
......