Commit 032af369 authored by Paul's avatar Paul
Browse files

Merge branch 'develop' into mlir-c

parents b406a418 46b0c33b
...@@ -32,8 +32,10 @@ import re ...@@ -32,8 +32,10 @@ import re
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones. # ones.
extensions = [ extensions = [
'breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_rtd_theme' 'breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_rtd_theme',
'sphinx.ext.autosectionlabel'
] ]
autosectionlabel_prefix_document = True
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
......
Tools
=====
roctx.py
--------
MIGraphX driver provides `roctx` command which can be used with `rocprof` binary to get marker timing information for each MIGraphX operator.
To help the user process timing information, a rocTX helper script is provided at `tools/roctx.py`.
The `roctx.py` helper script provides two main functions: `run` and `parse`. Available knobs and usage are given below:
::
Usage: roctx.py [-h] [--json-path json_path] [--out out]
[--study-name study-name] [--repeat repeat] [--parse]
[--run run] [--debug]
.. option:: --run
Runs `migraphx-driver roctx` command and given `migraphx-driver` knobs, and then parses the results, providing GPU kernel timing information.
MIGraphX knobs can be given via a string to `--run` knob. Please see the examples below.
.. option:: --parse
Given `--json-path`, parses JSON file and provides GPU kernel timing information.
.. option:: --out
Output folder
.. option:: --study-name
Optional. Allows user to name a study for easier interpretation. Defaults to timestamp.
.. option:: --repeat
Number of iterations. Set to **2** by default.
.. option:: --debug
Provides additional debug information related to data. Only use for debugging purposes.
**Examples:**
**Running inference with rocTX for a given ONNX file:**
::
python roctx.py --run '--onnx --gpu fcn-resnet50-11.onnx' --out output_folder --repeat 5
After a run, output similar to that given below is expected at the terminal. The output will provide `SUM`, `MIN`, `MAX` and `COUNT` information for each kernel executed for a given model.
Average total time is also provided. There are three files provided for reference:
1. `OUTPUT CSV FILE` provides a summary of the run, providing utilized MIGraphX knobs and related kernel timing information
2. `KERNEL TIMING DETAILS` provides the hotspot kernel timing information
3. This will provide all output data related to all iterations executed during a run.
An example output:
.. image:: ./roctx1.jpg
Hotspot kernel timing information:
.. image:: ./roctx2.jpg
**Parsing an already existing JSON file:**
::
python roctx.py --parse --json-path ../trace.json
\ No newline at end of file
...@@ -13,3 +13,4 @@ Developer Guide ...@@ -13,3 +13,4 @@ Developer Guide
dev/quantization dev/quantization
dev/pass dev/pass
dev/matchers dev/matchers
dev/tools
...@@ -61,3 +61,21 @@ Verify each instruction ...@@ -61,3 +61,21 @@ Verify each instruction
.. option:: -r, --reduce .. option:: -r, --reduce
Reduce program and verify Reduce program and verify
roctx
----
.. program:: migraphx-driver roctx
Provides marker information for each operation, allowing MIGraphX to be used with `rocprof <https://rocmdocs.amd.com/en/latest/ROCm_Tools/ROCm-Tools.html>`_ for performance analysis.
This allows the user to obtain GPU-level kernel timing information.
An example command line combined with rocprof for tracing purposes is given below:
.. code-block:: bash
/opt/rocm/bin/rocprof --hip-trace --roctx-trace --flush-rate 1ms --timestamp on -d <OUTPUT_PATH> --obj-tracking on /opt/rocm/bin/migraphx-driver roctx <ONNX_FILE> <MIGRAPHX_OPTIONS>
After `rocprof` is run, the output directory will contain trace information for HIP, HCC and ROCTX in separate `.txt` files.
To understand the interactions between API calls, it is recommended to utilize the `roctx.py` helper script as described in the :ref:`dev/tools:rocTX` section.
.. include:: ./driver/compile.rst
\ No newline at end of file
...@@ -10,6 +10,16 @@ ...@@ -10,6 +10,16 @@
"https://github.com/naomifridman/Unet_Brain_tumor_segmentation" "https://github.com/naomifridman/Unet_Brain_tumor_segmentation"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "09ceec31",
"metadata": {},
"outputs": [],
"source": [
"!pip install SimpleITK matplotlib scikit-image"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
"- How to optimize NFNet ONNX model with AMD MIGraphX.\n", "- How to optimize NFNet ONNX model with AMD MIGraphX.\n",
"- How to run inference on AMD GPU with the optimized ONNX model.\n", "- How to run inference on AMD GPU with the optimized ONNX model.\n",
"\n", "\n",
"The NFNet utilized in this example is the smallest NFNet version, F0: 71.5M parameters (83.6% top-1 accuracy on ImageNet)" "The NFNet utilized in this example is the smallest NFNet version, F0: 71.5M parameters (83.6% top-1 accuracy on ImageNet)\n",
"\n",
"Please make sure MIGraphX Python API is installed following the instructions at Github page."
] ]
}, },
{ {
...@@ -107,7 +109,7 @@ ...@@ -107,7 +109,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"with open('../python_api_inference/imagenet_simple_labels.json') as json_data:\n", "with open('../python_resnet50/imagenet_simple_labels.json') as json_data:\n",
" labels = json.load(json_data)" " labels = json.load(json_data)"
] ]
}, },
......
opencv-python opencv-python
onnxruntime onnxruntime
\ No newline at end of file image
\ No newline at end of file
...@@ -33,9 +33,6 @@ static void create_pointwise_modules(module_pass_manager& mpm) ...@@ -33,9 +33,6 @@ static void create_pointwise_modules(module_pass_manager& mpm)
{ {
if(not ins->get_operator().attributes().get("pointwise", false)) if(not ins->get_operator().attributes().get("pointwise", false))
continue; continue;
// Skip convert op for now
if(ins->name() == "convert")
continue;
assert(ins->get_operator().attributes().contains("point_op")); assert(ins->get_operator().attributes().contains("point_op"));
auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++)); auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++));
pm->set_bypass(); pm->set_bypass();
...@@ -129,22 +126,25 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins, ...@@ -129,22 +126,25 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
static bool find_pointwise_modules(module& m) static bool find_pointwise_modules(module& m)
{ {
bool changed = false; bool changed = false;
auto last = std::prev(m.end());
for(auto ins : iterator_for(m)) for(auto ins : iterator_for(m))
{ {
if(ins->name() != "pointwise") if(ins->name() != "pointwise")
continue; continue;
if(ins->outputs().empty()) if(ins->outputs().empty() and ins != last)
continue; continue;
auto it = std::find_if(ins->inputs().begin(), ins->inputs().end(), [&](auto i) { auto it = std::find_if(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return i->name() == "pointwise" and i->outputs().size() == 1; return i->name() == "pointwise" and i->outputs().size() == 1;
}); });
if(it == ins->inputs().end()) if(it == ins->inputs().end())
continue; continue;
auto input = *it;
auto new_inputs = append_pointwise_module(input, ins);
m.replace_instruction(input, input->get_operator(), new_inputs, input->module_inputs());
m.replace_instruction(ins, input);
m.move_instruction(input, ins);
auto new_inputs = append_pointwise_module(*it, ins);
m.replace_instruction(*it, (*it)->get_operator(), new_inputs, (*it)->module_inputs());
m.replace_instruction(ins, *it);
m.move_instruction(*it, ins);
changed = true; changed = true;
} }
return changed; return changed;
......
...@@ -32,6 +32,11 @@ struct convert : unary<convert> ...@@ -32,6 +32,11 @@ struct convert : unary<convert>
return {target_type, inputs.at(0).lens(), inputs.at(0).strides()}; return {target_type, inputs.at(0).lens(), inputs.at(0).strides()};
} }
std::string point_op() const
{
return "${function:convert}<" + shape::cpp_type(target_type) + ">(${0})";
}
auto apply() const auto apply() const
{ {
auto type = target_type; auto type = target_type;
......
...@@ -179,6 +179,7 @@ instruction_ref module::insert_instruction(instruction_ref ins, ...@@ -179,6 +179,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
const operation& op, const operation& op,
std::vector<instruction_ref> args) std::vector<instruction_ref> args)
{ {
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args); shape r = compute_shape(op, args);
auto result = impl->insert(ins, {op, r, std::move(args)}); auto result = impl->insert(ins, {op, r, std::move(args)});
...@@ -200,6 +201,7 @@ instruction_ref module::insert_instruction(instruction_ref ins, ...@@ -200,6 +201,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
std::vector<instruction_ref> args, std::vector<instruction_ref> args,
std::vector<module_ref> module_args) std::vector<module_ref> module_args)
{ {
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args); auto out_shape = compute_shape(op, args, module_args);
auto result = impl->insert(ins, {op, out_shape, std::move(args), std::move(module_args)}); auto result = impl->insert(ins, {op, out_shape, std::move(args), std::move(module_args)});
...@@ -212,6 +214,7 @@ instruction_ref module::replace_instruction(instruction_ref ins, ...@@ -212,6 +214,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
const operation& op, const operation& op,
std::vector<instruction_ref> args) MIGRAPHX_TIDY_CONST std::vector<instruction_ref> args) MIGRAPHX_TIDY_CONST
{ {
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args); shape r = compute_shape(op, args);
...@@ -225,6 +228,7 @@ instruction_ref module::replace_instruction(instruction_ref ins, ...@@ -225,6 +228,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
std::vector<instruction_ref> args, std::vector<instruction_ref> args,
std::vector<module_ref> module_args) MIGRAPHX_TIDY_CONST std::vector<module_ref> module_args) MIGRAPHX_TIDY_CONST
{ {
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@")); assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args); auto out_shape = compute_shape(op, args, module_args);
instruction::replace(ins, op, out_shape, std::move(args), std::move(module_args)); instruction::replace(ins, op, out_shape, std::move(args), std::move(module_args));
...@@ -291,6 +295,8 @@ instruction_ref module::remove_instructions(instruction_ref first, instruction_r ...@@ -291,6 +295,8 @@ instruction_ref module::remove_instructions(instruction_ref first, instruction_r
instruction_ref module::move_instruction(instruction_ref src, instruction_ref dst) instruction_ref module::move_instruction(instruction_ref src, instruction_ref dst)
{ {
assert(has_instruction(src));
assert(has_instruction(dst) or is_end(dst, this->end()));
impl->instructions.splice(dst, impl->instructions, src); impl->instructions.splice(dst, impl->instructions, src);
return src; return src;
} }
......
...@@ -108,7 +108,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option ...@@ -108,7 +108,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
srcs.push_back(src_file{fs::path{"main.cpp"}, srcs.push_back(src_file{fs::path{"main.cpp"},
std::make_pair(content.data(), content.data() + content.size())}); std::make_pair(content.data(), content.data() + content.size())});
auto args_hpp = auto args_hpp =
generate_args_hpp(options.reduced_inputs.empty() ? options.inputs : options.reduced_inputs); generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
srcs.push_back(src_file{fs::path{"args.hpp"}, srcs.push_back(src_file{fs::path{"args.hpp"},
std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())}); std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())});
options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global); options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
......
...@@ -20,7 +20,7 @@ static const char* const pointwise_kernel = R"__migraphx__( ...@@ -20,7 +20,7 @@ static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/pointwise.hpp> #include <migraphx/kernels/pointwise.hpp>
#include <args.hpp> #include <args.hpp>
using namespace migraphx; namespace migraphx {
${preamble} ${preamble}
...@@ -32,6 +32,8 @@ __global__ void kernel(${params}) ...@@ -32,6 +32,8 @@ __global__ void kernel(${params})
} }
} // namespace migraphx
int main() {} int main() {}
)__migraphx__"; )__migraphx__";
...@@ -46,7 +48,7 @@ operation compile_pointwise(context&, ...@@ -46,7 +48,7 @@ operation compile_pointwise(context&,
options.local = 1024; options.local = 1024;
options.inputs = inputs; options.inputs = inputs;
options.output = inputs.back(); options.output = inputs.back();
options.reduced_inputs = reduce_dims(inputs); options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal"; options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel, auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")}, {{"params", enum_params(inputs.size(), "void * private_p")},
...@@ -60,6 +62,7 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu ...@@ -60,6 +62,7 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu
{ {
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}}); run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g; cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"})); auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"}));
return compile_pointwise((ctx), inputs, "&" + name, g.str()); return compile_pointwise((ctx), inputs, "&" + name, g.str());
} }
......
...@@ -50,7 +50,7 @@ operation compile_roialign(context&, const std::vector<shape>& io_shapes, const ...@@ -50,7 +50,7 @@ operation compile_roialign(context&, const std::vector<shape>& io_shapes, const
options.inputs = io_shapes; options.inputs = io_shapes;
options.output = out_s; options.output = out_s;
options.kernel_name = "roialign_kernel"; options.kernel_name = "roialign_kernel";
options.reduced_inputs = io_shapes; options.virtual_inputs = io_shapes;
// sampling_ratio // sampling_ratio
assert(val.contains("sampling_ratio")); assert(val.contains("sampling_ratio"));
......
...@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype( ...@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(
inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024) inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
{ {
index_int groups = (n + local - 1) / local; index_int groups = (n + local - 1) / local;
index_int nglobal = std::min<index_int>(256, groups) * local; // max possible number of blocks is set to 1B (1,073,741,824)
index_int nglobal = std::min<index_int>(1073741824, groups) * local;
return [=](auto f) { return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) __device__ { launch(stream, nglobal, local)([=](auto idx) __device__ {
......
...@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in ...@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
migraphx::shape batch_shape{result.get_shape().type(), batch_lens}; migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) { hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
const index_int max_block_size = 256; const index_int max_block_size = 128;
const index_int block_size = compute_block_size(batch_item_num, max_block_size); const index_int block_size = compute_block_size(batch_item_num, max_block_size);
gs_launch(stream, using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
batch_shape.elements() * block_size, type init = lowest();
block_size)([=](auto i, auto idx) __device__ {
auto data_idx = batch.multi(i / block_size); if(axis == batch_lens.size() - 1)
using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>; {
type init = lowest(); gs_launch(stream, batch_shape.elements() * block_size, block_size)(
[=](auto i, auto idx) __device__ {
auto batch_max = block_reduce<max_block_size>( auto start_loc = i / block_size * batch_item_num;
idx, max{}, init, batch_item_num, [&](auto j) __device__ { auto batch_max = block_reduce<max_block_size>(
data_idx[axis] = j; idx, max{}, init, batch_item_num, [&](auto j) __device__ {
return input[data_idx]; return input[start_loc + j];
}); });
auto batch_sum = block_reduce<max_block_size>(
idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
auto val = input[start_loc + j] - batch_max;
return ::exp(to_hip_type(val));
});
auto batch_sum = idx.local_stride(batch_item_num, [&](auto j) __device__ {
block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ { auto val = input[start_loc + j] - batch_max;
data_idx[axis] = j; output[start_loc + j] = ::exp(to_hip_type(val)) / batch_sum;
auto val = input[data_idx] - batch_max; });
return ::exp(to_hip_type(val));
}); });
}
else
{
gs_launch(stream, batch_shape.elements() * block_size, block_size)(
[=](auto i, auto idx) __device__ {
auto data_idx = batch.multi(i / block_size);
auto batch_max = block_reduce<max_block_size>(
idx, max{}, init, batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
return input[data_idx];
});
idx.local_stride(batch_item_num, [&](auto j) __device__ { auto batch_sum = block_reduce<max_block_size>(
data_idx[axis] = j; idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
auto val = input[data_idx] - batch_max; data_idx[axis] = j;
output[data_idx] = ::exp(to_hip_type(val)) / batch_sum; auto val = input[data_idx] - batch_max;
}); return ::exp(to_hip_type(val));
}); });
idx.local_stride(batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
auto val = input[data_idx] - batch_max;
output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
});
});
}
}); });
} }
......
...@@ -169,7 +169,7 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins) ...@@ -169,7 +169,7 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins) MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
{ {
const auto device_name = split_string(get_device_name(), ':').front(); const auto device_name = trim(split_string(get_device_name(), ':').front());
if(not contains(get_supported_archs(), device_name)) if(not contains(get_supported_archs(), device_name))
return false; return false;
if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{})) if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{}))
......
...@@ -16,7 +16,7 @@ struct hip_compile_options ...@@ -16,7 +16,7 @@ struct hip_compile_options
shape output; shape output;
std::string kernel_name = "kernel"; std::string kernel_name = "kernel";
std::string params = ""; std::string params = "";
std::vector<shape> reduced_inputs = {}; std::vector<shape> virtual_inputs = {};
}; };
operation compile_hip_code_object(const std::string& content, hip_compile_options options); operation compile_hip_code_object(const std::string& content, hip_compile_options options);
......
...@@ -16,6 +16,19 @@ struct swallow ...@@ -16,6 +16,19 @@ struct swallow
template <index_int> template <index_int>
using ignore = swallow; using ignore = swallow;
template <class... Fs>
struct overloaded : Fs...
{
using Fs::operator()...;
overloaded(Fs... fs) : Fs(fs)... {}
};
template <class... Fs>
overloaded<Fs...> overload(Fs... fs)
{
return {fs...};
}
namespace detail { namespace detail {
template <class R> template <class R>
...@@ -168,9 +181,13 @@ constexpr auto transform_args(F f, Fs... fs) ...@@ -168,9 +181,13 @@ constexpr auto transform_args(F f, Fs... fs)
return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); }; return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
} }
// NOLINTNEXTLINE
#define MIGRAPHX_RETURNS(...) \
->decltype(__VA_ARGS__) { return __VA_ARGS__; }
// NOLINTNEXTLINE // NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \ #define MIGRAPHX_LIFT(...) \
([](auto&&... xs) { return (__VA_ARGS__)(static_cast<decltype(xs)>(xs)...); }) ([](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP #endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment