Commit 032af369 authored by Paul's avatar Paul
Browse files

Merge branch 'develop' into mlir-c

parents b406a418 46b0c33b
...@@ -32,8 +32,10 @@ import re ...@@ -32,8 +32,10 @@ import re
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones. # ones.
extensions = [ extensions = [
'breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_rtd_theme' 'breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_rtd_theme',
'sphinx.ext.autosectionlabel'
] ]
autosectionlabel_prefix_document = True
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
......
Tools
=====
roctx.py
--------
MIGraphX driver provides `roctx` command which can be used with `rocprof` binary to get marker timing information for each MIGraphX operator.
To help the user process timing information, a rocTX helper script is provided at `tools/roctx.py`.
The `roctx.py` helper script provides two main functions: `run` and `parse`. Available knobs and usage are given below:
::
Usage: roctx.py [-h] [--json-path json_path] [--out out]
[--study-name study-name] [--repeat repeat] [--parse]
[--run run] [--debug]
.. option:: --run
Runs `migraphx-driver roctx` command and given `migraphx-driver` knobs, and then parses the results, providing GPU kernel timing information.
MIGraphX knobs can be given via a string to `--run` knob. Please see the examples below.
.. option:: --parse
Given `--json-path`, parses JSON file and provides GPU kernel timing information.
.. option:: --out
Output folder
.. option:: --study-name
Optional. Allows user to name a study for easier interpretation. Defaults to timestamp.
.. option:: --repeat
Number of iterations. Set to **2** by default.
.. option:: --debug
Provides additional debug information related to data. Only use for debugging purposes.
**Examples:**
**Running inference with rocTX for a given ONNX file:**
::
python roctx.py --run '--onnx --gpu fcn-resnet50-11.onnx' --out output_folder --repeat 5
After a run, output similar to that given below is expected at the terminal. The output will provide `SUM`, `MIN`, `MAX` and `COUNT` information for each kernel executed for a given model.
Average total time is also provided. There are three files provided for reference:
1. `OUTPUT CSV FILE` provides a summary of the run, providing utilized MIGraphX knobs and related kernel timing information
2. `KERNEL TIMING DETAILS` provides the hotspot kernel timing information
3. This will provide all output data related to all iterations executed during a run.
An example output:
.. image:: ./roctx1.jpg
Hotspot kernel timing information:
.. image:: ./roctx2.jpg
**Parsing an already existing JSON file:**
::
python roctx.py --parse --json-path ../trace.json
\ No newline at end of file
...@@ -13,3 +13,4 @@ Developer Guide ...@@ -13,3 +13,4 @@ Developer Guide
dev/quantization dev/quantization
dev/pass dev/pass
dev/matchers dev/matchers
dev/tools
...@@ -61,3 +61,21 @@ Verify each instruction ...@@ -61,3 +61,21 @@ Verify each instruction
.. option:: -r, --reduce .. option:: -r, --reduce
Reduce program and verify Reduce program and verify
roctx
----
.. program:: migraphx-driver roctx
Provides marker information for each operation, allowing MIGraphX to be used with `rocprof <https://rocmdocs.amd.com/en/latest/ROCm_Tools/ROCm-Tools.html>`_ for performance analysis.
This allows the user to obtain GPU-level kernel timing information.
An example command line combined with rocprof for tracing purposes is given below:
.. code-block:: bash
/opt/rocm/bin/rocprof --hip-trace --roctx-trace --flush-rate 1ms --timestamp on -d <OUTPUT_PATH> --obj-tracking on /opt/rocm/bin/migraphx-driver roctx <ONNX_FILE> <MIGRAPHX_OPTIONS>
After `rocprof` is run, the output directory will contain trace information for HIP, HCC and ROCTX in separate `.txt` files.
To understand the interactions between API calls, it is recommended to utilize the `roctx.py` helper script as described in the :ref:`dev/tools:rocTX` section.
.. include:: ./driver/compile.rst
\ No newline at end of file
...@@ -10,6 +10,16 @@ ...@@ -10,6 +10,16 @@
"https://github.com/naomifridman/Unet_Brain_tumor_segmentation" "https://github.com/naomifridman/Unet_Brain_tumor_segmentation"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "09ceec31",
"metadata": {},
"outputs": [],
"source": [
"!pip install SimpleITK matplotlib scikit-image"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
"- How to optimize NFNet ONNX model with AMD MIGraphX.\n", "- How to optimize NFNet ONNX model with AMD MIGraphX.\n",
"- How to run inference on AMD GPU with the optimized ONNX model.\n", "- How to run inference on AMD GPU with the optimized ONNX model.\n",
"\n", "\n",
"The NFNet utilized in this example is the smallest NFNet version, F0: 71.5M parameters (83.6% top-1 accuracy on ImageNet)" "The NFNet utilized in this example is the smallest NFNet version, F0: 71.5M parameters (83.6% top-1 accuracy on ImageNet)\n",
"\n",
"Please make sure MIGraphX Python API is installed following the instructions at Github page."
] ]
}, },
{ {
...@@ -107,7 +109,7 @@ ...@@ -107,7 +109,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"with open('../python_api_inference/imagenet_simple_labels.json') as json_data:\n", "with open('../python_resnet50/imagenet_simple_labels.json') as json_data:\n",
" labels = json.load(json_data)" " labels = json.load(json_data)"
] ]
}, },
......
opencv-python opencv-python
onnxruntime onnxruntime
\ No newline at end of file image
\ No newline at end of file
...@@ -33,9 +33,6 @@ static void create_pointwise_modules(module_pass_manager& mpm) ...@@ -33,9 +33,6 @@ static void create_pointwise_modules(module_pass_manager& mpm)
{ {
if(not ins->get_operator().attributes().get("pointwise", false)) if(not ins->get_operator().attributes().get("pointwise", false))
continue; continue;
// Skip convert op for now
if(ins->name() == "convert")
continue;
assert(ins->get_operator().attributes().contains("point_op")); assert(ins->get_operator().attributes().contains("point_op"));
auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++)); auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++));
pm->set_bypass(); pm->set_bypass();
...@@ -129,22 +126,25 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins, ...@@ -129,22 +126,25 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
static bool find_pointwise_modules(module& m) static bool find_pointwise_modules(module& m)
{ {
bool changed = false; bool changed = false;
auto last = std::prev(m.end());
for(auto ins : iterator_for(m)) for(auto ins : iterator_for(m))
{ {
if(ins->name() != "pointwise") if(ins->name() != "pointwise")
continue; continue;
if(ins->outputs().empty()) if(ins->outputs().empty() and ins != last)
continue; continue;
auto it = std::find_if(ins->inputs().begin(), ins->inputs().end(), [&](auto i) { auto it = std::find_if(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return i->name() == "pointwise" and i->outputs().size() == 1; return i->name() == "pointwise" and i->outputs().size() == 1;
}); });
if(it == ins->inputs().end()) if(it == ins->inputs().end())
continue; continue;
auto input = *it;
auto new_inputs = append_pointwise_module(input, ins);
m.replace_instruction(input, input->get_operator(), new_inputs, input->module_inputs());
m.replace_instruction(ins, input);
m.move_instruction(input, ins);
auto new_inputs = append_pointwise_module(*it, ins);
m.replace_instruction(*it, (*it)->get_operator(), new_inputs, (*it)->module_inputs());
m.replace_instruction(ins, *it);
m.move_instruction(*it, ins);
changed = true; changed = true;
} }
return changed; return changed;
......
...@@ -32,6 +32,11 @@ struct convert : unary<convert> ...@@ -32,6 +32,11 @@ struct convert : unary<convert>
return {target_type, inputs.at(0).lens(), inputs.at(0).strides()}; return {target_type, inputs.at(0).lens(), inputs.at(0).strides()};
} }
std::string point_op() const
{
return "${function:convert}<" + shape::cpp_type(target_type) + ">(${0})";
}
auto apply() const auto apply() const
{ {
auto type = target_type; auto type = target_type;
......
...@@ -179,6 +179,7 @@ instruction_ref module::insert_instruction(instruction_ref ins, ...@@ -179,6 +179,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
const operation& op, const operation& op,
std::vector<instruction_ref> args) std::vector<instruction_ref> args)
{ {
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args); shape r = compute_shape(op, args);
auto result = impl->insert(ins, {op, r, std::move(args)}); auto result = impl->insert(ins, {op, r, std::move(args)});
...@@ -200,6 +201,7 @@ instruction_ref module::insert_instruction(instruction_ref ins, ...@@ -200,6 +201,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
std::vector<instruction_ref> args, std::vector<instruction_ref> args,
std::vector<module_ref> module_args) std::vector<module_ref> module_args)
{ {
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args); auto out_shape = compute_shape(op, args, module_args);
auto result = impl->insert(ins, {op, out_shape, std::move(args), std::move(module_args)}); auto result = impl->insert(ins, {op, out_shape, std::move(args), std::move(module_args)});
...@@ -212,6 +214,7 @@ instruction_ref module::replace_instruction(instruction_ref ins, ...@@ -212,6 +214,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
const operation& op, const operation& op,
std::vector<instruction_ref> args) MIGRAPHX_TIDY_CONST std::vector<instruction_ref> args) MIGRAPHX_TIDY_CONST
{ {
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args); shape r = compute_shape(op, args);
...@@ -225,6 +228,7 @@ instruction_ref module::replace_instruction(instruction_ref ins, ...@@ -225,6 +228,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
std::vector<instruction_ref> args, std::vector<instruction_ref> args,
std::vector<module_ref> module_args) MIGRAPHX_TIDY_CONST std::vector<module_ref> module_args) MIGRAPHX_TIDY_CONST
{ {
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args); auto out_shape = compute_shape(op, args, module_args);
instruction::replace(ins, op, out_shape, std::move(args), std::move(module_args)); instruction::replace(ins, op, out_shape, std::move(args), std::move(module_args));
...@@ -291,6 +295,8 @@ instruction_ref module::remove_instructions(instruction_ref first, instruction_r ...@@ -291,6 +295,8 @@ instruction_ref module::remove_instructions(instruction_ref first, instruction_r
instruction_ref module::move_instruction(instruction_ref src, instruction_ref dst) instruction_ref module::move_instruction(instruction_ref src, instruction_ref dst)
{ {
assert(has_instruction(src));
assert(has_instruction(dst) or is_end(dst, this->end()));
impl->instructions.splice(dst, impl->instructions, src); impl->instructions.splice(dst, impl->instructions, src);
return src; return src;
} }
......
...@@ -108,7 +108,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option ...@@ -108,7 +108,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
srcs.push_back(src_file{fs::path{"main.cpp"}, srcs.push_back(src_file{fs::path{"main.cpp"},
std::make_pair(content.data(), content.data() + content.size())}); std::make_pair(content.data(), content.data() + content.size())});
auto args_hpp = auto args_hpp =
generate_args_hpp(options.reduced_inputs.empty() ? options.inputs : options.reduced_inputs); generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
srcs.push_back(src_file{fs::path{"args.hpp"}, srcs.push_back(src_file{fs::path{"args.hpp"},
std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())}); std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())});
options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global); options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
......
...@@ -20,7 +20,7 @@ static const char* const pointwise_kernel = R"__migraphx__( ...@@ -20,7 +20,7 @@ static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/pointwise.hpp> #include <migraphx/kernels/pointwise.hpp>
#include <args.hpp> #include <args.hpp>
using namespace migraphx; namespace migraphx {
${preamble} ${preamble}
...@@ -32,6 +32,8 @@ __global__ void kernel(${params}) ...@@ -32,6 +32,8 @@ __global__ void kernel(${params})
} }
} // namespace migraphx
int main() {} int main() {}
)__migraphx__"; )__migraphx__";
...@@ -46,7 +48,7 @@ operation compile_pointwise(context&, ...@@ -46,7 +48,7 @@ operation compile_pointwise(context&,
options.local = 1024; options.local = 1024;
options.inputs = inputs; options.inputs = inputs;
options.output = inputs.back(); options.output = inputs.back();
options.reduced_inputs = reduce_dims(inputs); options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal"; options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel, auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")}, {{"params", enum_params(inputs.size(), "void * private_p")},
...@@ -60,6 +62,7 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu ...@@ -60,6 +62,7 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu
{ {
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}}); run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g; cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"})); auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"}));
return compile_pointwise((ctx), inputs, "&" + name, g.str()); return compile_pointwise((ctx), inputs, "&" + name, g.str());
} }
......
...@@ -50,7 +50,7 @@ operation compile_roialign(context&, const std::vector<shape>& io_shapes, const ...@@ -50,7 +50,7 @@ operation compile_roialign(context&, const std::vector<shape>& io_shapes, const
options.inputs = io_shapes; options.inputs = io_shapes;
options.output = out_s; options.output = out_s;
options.kernel_name = "roialign_kernel"; options.kernel_name = "roialign_kernel";
options.reduced_inputs = io_shapes; options.virtual_inputs = io_shapes;
// sampling_ratio // sampling_ratio
assert(val.contains("sampling_ratio")); assert(val.contains("sampling_ratio"));
......
...@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype( ...@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(
inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024) inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
{ {
index_int groups = (n + local - 1) / local; index_int groups = (n + local - 1) / local;
index_int nglobal = std::min<index_int>(256, groups) * local; // max possible number of blocks is set to 1B (1,073,741,824)
index_int nglobal = std::min<index_int>(1073741824, groups) * local;
return [=](auto f) { return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) __device__ { launch(stream, nglobal, local)([=](auto idx) __device__ {
......
...@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in ...@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
migraphx::shape batch_shape{result.get_shape().type(), batch_lens}; migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) { hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
const index_int max_block_size = 256; const index_int max_block_size = 128;
const index_int block_size = compute_block_size(batch_item_num, max_block_size); const index_int block_size = compute_block_size(batch_item_num, max_block_size);
gs_launch(stream, using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
batch_shape.elements() * block_size, type init = lowest();
block_size)([=](auto i, auto idx) __device__ {
auto data_idx = batch.multi(i / block_size); if(axis == batch_lens.size() - 1)
using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>; {
type init = lowest(); gs_launch(stream, batch_shape.elements() * block_size, block_size)(
[=](auto i, auto idx) __device__ {
auto batch_max = block_reduce<max_block_size>( auto start_loc = i / block_size * batch_item_num;
idx, max{}, init, batch_item_num, [&](auto j) __device__ { auto batch_max = block_reduce<max_block_size>(
data_idx[axis] = j; idx, max{}, init, batch_item_num, [&](auto j) __device__ {
return input[data_idx]; return input[start_loc + j];
}); });
auto batch_sum = block_reduce<max_block_size>(
idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
auto val = input[start_loc + j] - batch_max;
return ::exp(to_hip_type(val));
});
auto batch_sum = idx.local_stride(batch_item_num, [&](auto j) __device__ {
block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ { auto val = input[start_loc + j] - batch_max;
data_idx[axis] = j; output[start_loc + j] = ::exp(to_hip_type(val)) / batch_sum;
auto val = input[data_idx] - batch_max; });
return ::exp(to_hip_type(val));
}); });
}
else
{
gs_launch(stream, batch_shape.elements() * block_size, block_size)(
[=](auto i, auto idx) __device__ {
auto data_idx = batch.multi(i / block_size);
auto batch_max = block_reduce<max_block_size>(
idx, max{}, init, batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
return input[data_idx];
});
idx.local_stride(batch_item_num, [&](auto j) __device__ { auto batch_sum = block_reduce<max_block_size>(
data_idx[axis] = j; idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
auto val = input[data_idx] - batch_max; data_idx[axis] = j;
output[data_idx] = ::exp(to_hip_type(val)) / batch_sum; auto val = input[data_idx] - batch_max;
}); return ::exp(to_hip_type(val));
}); });
idx.local_stride(batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
auto val = input[data_idx] - batch_max;
output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
});
});
}
}); });
} }
......
...@@ -169,7 +169,7 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins) ...@@ -169,7 +169,7 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins) MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
{ {
const auto device_name = split_string(get_device_name(), ':').front(); const auto device_name = trim(split_string(get_device_name(), ':').front());
if(not contains(get_supported_archs(), device_name)) if(not contains(get_supported_archs(), device_name))
return false; return false;
if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{})) if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{}))
......
...@@ -16,7 +16,7 @@ struct hip_compile_options ...@@ -16,7 +16,7 @@ struct hip_compile_options
shape output; shape output;
std::string kernel_name = "kernel"; std::string kernel_name = "kernel";
std::string params = ""; std::string params = "";
std::vector<shape> reduced_inputs = {}; std::vector<shape> virtual_inputs = {};
}; };
operation compile_hip_code_object(const std::string& content, hip_compile_options options); operation compile_hip_code_object(const std::string& content, hip_compile_options options);
......
...@@ -16,6 +16,19 @@ struct swallow ...@@ -16,6 +16,19 @@ struct swallow
template <index_int> template <index_int>
using ignore = swallow; using ignore = swallow;
template <class... Fs>
struct overloaded : Fs...
{
using Fs::operator()...;
overloaded(Fs... fs) : Fs(fs)... {}
};
template <class... Fs>
overloaded<Fs...> overload(Fs... fs)
{
return {fs...};
}
namespace detail { namespace detail {
template <class R> template <class R>
...@@ -168,9 +181,13 @@ constexpr auto transform_args(F f, Fs... fs) ...@@ -168,9 +181,13 @@ constexpr auto transform_args(F f, Fs... fs)
return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); }; return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
} }
// NOLINTNEXTLINE
#define MIGRAPHX_RETURNS(...) \
->decltype(__VA_ARGS__) { return __VA_ARGS__; }
// NOLINTNEXTLINE // NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \ #define MIGRAPHX_LIFT(...) \
([](auto&&... xs) { return (__VA_ARGS__)(static_cast<decltype(xs)>(xs)...); }) ([](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP #endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment