Commit 801a349c authored by charlie

Merge branch 'dyn_gemm' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_onnx_gemm

parents 1a44d14f 740a2bc6
......@@ -5,14 +5,14 @@ on:
branches: [develop]
types: [opened, synchronize, closed]
schedule:
- cron: "0 5 * * 1-6"
- cron: "0 6 * * 1-6"
workflow_dispatch:
inputs:
rocm_release:
description: ROCm Version
required: true
default: '5.2'
default: '5.3'
performance_reports_repo:
description: Result repository
required: true
......@@ -30,9 +30,9 @@ concurrency: "perftest-${{ github.head_ref || github.base_ref || 'schedule' }}"
jobs:
release:
uses: rocmsoftwareplatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
uses: ROCmSoftwarePlatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
with:
rocm_release: ${{ github.event.inputs.rocm_release || '5.2' }}
rocm_release: ${{ github.event.inputs.rocm_release || '5.3' }}
result_number: ${{ github.event.inputs.result_number || '10' }}
flags: ${{ github.event.inputs.flags || '-s' }}
performance_reports_repo: ${{ github.event.inputs.performance_reports_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
......
......@@ -29,6 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
| --tf | Load file as a tensorflow graph |
| --migraphx | Load file as a migraphx graph |
| --migraphx-json | Load file as a migraphx JSON graph |
| --batch | Set batch size for the model |
| --nhwc | Treat tensorflow format as nhwc |
| --nchw | Treat tensorflow format as nchw |
| --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |
......
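A hedged usage sketch for the new --batch option (the driver executable name, the perf subcommand, and the model file are assumptions for illustration, not part of this change):

migraphx-driver perf model.onnx --batch 4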
......@@ -21,6 +21,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
tensorflow==2.7.2
tensorflow==2.9.3
onnxruntime
tokenizers
\ No newline at end of file
......@@ -42,6 +42,13 @@ static bool try_compute_shape(instruction_ref ins,
try
{
shape new_shape = ins->get_operator().compute_shape(inputs, mods);
// Cannot tell if a dynamic shape will need to be made contiguous
if(new_shape.dynamic())
{
return false;
}
// If the output shape is a standard shape, no need to try its output
if(new_shape.standard())
{
......@@ -133,14 +140,20 @@ static void remove_contiguous(const std::string& op_name, module& m, F f)
}
}
// Perform evaluations in parallel
// Perform static contiguous evaluations in parallel
std::vector<argument> literals(const_instructions.size());
par_for(const_instructions.size(), 1, [&](const auto i) {
auto c = op::contiguous{};
auto prev = const_instructions[i]->inputs().front();
literals[i] = c.compute(c.compute_shape({prev->get_shape()}), {prev->eval()});
auto c = op::contiguous{};
auto prev = const_instructions[i]->inputs().front();
// compute the output contiguous shape from the previous instruction shape
shape computed_shape = c.compute_shape({prev->get_shape()});
const std::vector<argument>& prev_eval = {prev->eval()};
// prev_eval should not be used in make_compute_output_shape() as computed_shape is static
auto co_shape = make_compute_output_shape(pack(c, computed_shape, prev_eval));
literals[i] = c.compute(co_shape, prev_eval);
});
// Replace static contiguous operations with a literal
for(size_t i = 0; i < const_instructions.size(); i++)
{
auto l = m.add_literal(literals[i].get_shape(), literals[i].data());
......
......@@ -45,7 +45,16 @@ static literal get_scalar(instruction_ref ins)
return {};
auto e = ins->eval();
literal r{};
e.visit_at([&](auto x) { r = literal{x}; });
// Needed for bool because visit_at invokes as(), which promotes bool to int8.
// Without this, type checks for fused logical ops would break.
if(e.get_shape().type() == shape::bool_type)
{
r = literal{e.at<bool>()};
}
else
{
e.visit_at([&](auto x) { r = literal{x}; });
}
return r;
}
......
......@@ -28,6 +28,7 @@
#include <migraphx/argument.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -42,19 +43,27 @@ namespace op {
struct contiguous
{
std::string name() const { return "contiguous"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
if(inputs.front().standard())
return inputs.front();
auto lens = inputs.at(0).lens();
auto t = inputs.at(0).type();
return {t, lens};
check_shapes{inputs, *this, true}.has(1);
auto s0 = inputs.front();
if(s0.dynamic() or s0.standard())
{
return s0;
}
else
{
const auto& lens = s0.lens();
auto t = s0.type();
return {t, lens};
}
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
assert(output_shape.standard());
argument result{output_shape};
assert(dyn_out.computed_shape.standard());
argument result{dyn_out.computed_shape};
visit_all(result, args[0])([&](auto output, auto input) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
......
......@@ -233,11 +233,14 @@ get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
if(HAS_FIND_2_API)
# TODO: Set default to HAS_FIND_2_API
set(MIGRAPHX_USE_FIND_2_API OFF CACHE BOOL "")
if(MIGRAPHX_USE_FIND_2_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
else()
message(STATUS "MIOpen does not have Find-2.0 API")
message(STATUS "MIGraphx is using legacy Find API in MIOpen")
endif()
if(HAS_FIND_MODE_API)
......
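Since MIGRAPHX_USE_FIND_2_API now defaults to OFF, the MIOpen Find-2.0 path has to be opted into at configure time; a minimal sketch, assuming an out-of-source build directory below the repository root:

cmake -DMIGRAPHX_USE_FIND_2_API=ON ..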
......@@ -146,8 +146,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
pack_int8_args{},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
replace_allocate{gpu_allocation_model{}, options.offload_copy},
......
......@@ -272,6 +272,35 @@ TEST_CASE(contiguous_input)
EXPECT(p1 == p2);
}
TEST_CASE(contiguous_boolean_input)
{
migraphx::shape s{migraphx::shape::bool_type, {2, 3}};
migraphx::shape s_lit{migraphx::shape::bool_type, {1}, {0}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto x = mm->add_parameter("x", s);
auto one = mm->add_literal(migraphx::literal(s_lit, {1.0}));
auto yb =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), one);
auto y = mm->add_instruction(migraphx::make_op("contiguous"), yb);
auto xor1 = mm->add_instruction(migraphx::make_op("logical_xor"), x, y);
mm->add_return({xor1});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto xor1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
auto y = pm->add_literal(migraphx::literal(s_lit, {1}));
return pm->add_instruction(migraphx::make_op("logical_xor"), inputs[0], y);
});
mm->add_return({xor1});
}
EXPECT(p1 == p2);
}
TEST_CASE(all_scalar_input)
{
migraphx::shape s{migraphx::shape::float_type};
......
......@@ -6022,6 +6022,24 @@ TEST_CASE(transpose_test)
EXPECT(p == prog);
}
TEST_CASE(transpose_dyn_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto input = mm->add_parameter(
"0",
migraphx::shape{migraphx::shape::float_type, {{1, 4, 0}, {2, 2, 0}, {2, 2, 0}, {3, 3, 0}}});
std::vector<int64_t> perm{0, 3, 1, 2};
auto t0 = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", perm}}), input);
mm->add_return({t0});
migraphx::onnx_options options;
options.default_dyn_dim_value = {1, 4, 0};
auto prog = migraphx::parse_onnx("transpose_dyn_test.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(topk_attrk_test)
{
migraphx::program p;
......
......@@ -365,6 +365,12 @@ TEST_CASE(contiguous_shape)
expect_shape(single, migraphx::make_op("contiguous"), single);
}
TEST_CASE(contiguous_dyn_shape)
{
migraphx::shape s0{migraphx::shape::float_type, {{1, 4, 0}, {2, 2, 2}}};
expect_shape(s0, migraphx::make_op("contiguous"), s0);
}
TEST_CASE(contiguous_shape_scalar)
{
migraphx::shape output{migraphx::shape::float_type};
......@@ -2289,6 +2295,12 @@ TEST_CASE(transpose_dyn_shape1)
expect_shape(output, migraphx::make_op("transpose", {{"permutation", {2, 1, 0}}}), input);
}
TEST_CASE(transpose_axes_error)
{
migraphx::shape input{migraphx::shape::float_type, {2, 2}};
throws_shape(migraphx::make_op("transpose", {{"permutation", {1}}}), input);
}
TEST_CASE(step_test)
{
migraphx::shape s1{migraphx::shape::float_type, {1, 2, 4}};
......
......@@ -35,7 +35,7 @@
#include <migraphx/half.hpp>
template <class T>
void dot_2D_test()
void dot_2d_test()
{
migraphx::program p;
......@@ -82,11 +82,11 @@ void dot_2D_test()
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(c, results_vector));
}
TEST_CASE_REGISTER(dot_2D_test<float>)
TEST_CASE_REGISTER(dot_2D_test<double>)
TEST_CASE_REGISTER(dot_2d_test<float>)
TEST_CASE_REGISTER(dot_2d_test<double>)
template <class T>
void dot_4D_test()
void dot_4d_test()
{
migraphx::program p;
......@@ -133,8 +133,8 @@ void dot_4D_test()
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(c, results_vector));
}
TEST_CASE_REGISTER(dot_4D_test<float>)
TEST_CASE_REGISTER(dot_4D_test<double>)
TEST_CASE_REGISTER(dot_4d_test<float>)
TEST_CASE_REGISTER(dot_4d_test<double>)
TEST_CASE(dot_3D_test)
{
......
......@@ -926,6 +926,33 @@ TEST_CASE(contiguous_test)
EXPECT(migraphx::verify_range(results_vector, data));
}
TEST_CASE(contiguous_dyn_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape dyn_shape{migraphx::shape::float_type,
{{1, 1, 0}, {2, 6, 0}, {2, 2, 0}, {2, 2, 0}}};
auto input = mm->add_parameter("X", dyn_shape);
mm->add_instruction(migraphx::make_op("contiguous"), input);
p.compile(migraphx::ref::target{});
migraphx::shape static_shape{migraphx::shape::float_type, {1, 3, 2, 2}, {12, 1, 6, 3}};
std::vector<float> data(12);
std::iota(data.begin(), data.end(), 0);
migraphx::parameter_map params;
params["X"] = migraphx::argument(static_shape, data.data());
auto result = p.eval(params).back();
std::vector<size_t> new_strides = {12, 4, 2, 1};
EXPECT(result.get_shape().strides() == new_strides);
std::vector<float> results_vector(12);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11};
EXPECT(migraphx::verify_range(results_vector, gold));
}
TEST_CASE(conv_dyn_batch_test)
{
migraphx::program p;
......@@ -7345,16 +7372,15 @@ TEST_CASE(transpose_dyn_test)
migraphx::parameter_map params;
migraphx::shape input_fixed_shape{migraphx::shape::float_type, {1, 2, 2, 3}};
params["X"] = migraphx::argument(input_fixed_shape, data.data());
auto result = p.eval({}).back();
auto result = p.eval(params).back();
std::vector<size_t> new_lens = {1, 3, 2, 2};
EXPECT(result.get_shape().lens() == new_lens);
result.visit([&](auto output) {
std::vector<size_t> new_lens = {1, 3, 2, 2};
EXPECT(bool{output.get_shape().lens() == new_lens});
});
std::vector<float> results_vector(12);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
// no change in the data buffer
EXPECT(migraphx::verify_range(results_vector, data));
std::vector<float> gold = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11};
EXPECT(migraphx::verify_range(results_vector, gold));
}
TEST_CASE(unsqueeze_test)
......