Merge branch 'jit-layernorm-merge' of...

Merge branch 'jit-layernorm-merge' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into layernorm_eps

Merge branch 'jit-layernorm-merge' of...
Merge branch 'jit-layernorm-merge' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into layernorm_eps
86713e78 · Khalique Ahmed · d3930010 · 63db86bd · 86713e78 · 86713e78
Commit 86713e78 authored Sep 13, 2022 by Khalique Ahmed
13 changed files
--- a/src/targets/gpu/jit/scatternd.cpp
+++ b/src/targets/gpu/jit/scatternd.cpp
@@ -24,16 +24,8 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
-
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/jit/softmax.cpp
+++ b/src/targets/gpu/jit/softmax.cpp
@@ -26,15 +26,7 @@
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
-
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -91,7 +91,7 @@ __device__ auto& array2vec(T& x)
 template <class T, class... Ts>
 constexpr auto array_for_each(T& x, Ts&... xs)
 {
-    MIGRAPHX_ASSERT((x.size() == xs.size() and ...));
+    MIGRAPHX_ASSERT(((x.size() == xs.size()) and ...));
    return [&](auto f) {
        constexpr auto size = decltype(x.size()){};
        if constexpr((is_vectorizable<typename T::value_type>() or

--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -28,9 +28,60 @@
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/type_traits.hpp>
+#include <migraphx/kernels/debug.hpp>

 namespace migraphx {

+#if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
+#define MIGRAPHX_NGROUP ((MIGRAPHX_NGLOBAL + MIGRAPHX_NLOCAL - 1) / MIGRAPHX_NLOCAL)
+#endif
+
+inline __device__ __attribute__((const)) index_int compute_global_size()
+{
+#ifdef MIGRAPHX_NGLOBAL
+    return MIGRAPHX_NGLOBAL;
+#else
+    // This actualy works even when global is not divisible by local size.
+    // This doesnt actually do a multiplicatiosn. Instead it calls a device
+    // function to get the global size, which is why it works.
+    return blockDim.x * gridDim.x;  // NOLINT
+#endif
+}
+
+// We cant just use blockDim.x to get the local size since its broken on hip
+// when global is not divisible by local size. In this case, we calulate the
+// size for the last group.
+inline __device__ __attribute__((const)) index_int compute_local_size()
+{
+#ifdef MIGRAPHX_NLOCAL
+    const auto nlocal = MIGRAPHX_NLOCAL;
+#else
+    const auto nlocal = blockDim.x; // NOLINT
+#endif
+#ifdef MIGRAPHX_NGROUP
+    const auto ngroup = MIGRAPHX_NGROUP;
+#else
+    const auto ngroup = gridDim.x;  // NOLINT
+#endif
+    const auto group_id = blockIdx.x; // NOLINT
+    const auto nglobal  = compute_global_size();
+    if(group_id == ngroup - 1)
+    {
+        return 1 + (nglobal - 1) % nlocal;
+    }
+    else
+    {
+        return nlocal; // NOLINT
+    }
+}
+
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 struct index
 {
    index_int global = 0;
@@ -38,20 +89,44 @@ struct index
    index_int group  = 0;

 #ifdef MIGRAPHX_NGLOBAL
-    constexpr index_constant<MIGRAPHX_NGLOBAL> nglobal() const { return {}; }
+    constexpr index_constant<MIGRAPHX_NGLOBAL> nglobal() const
+    {
+        static_assert(MIGRAPHX_NGLOBAL > 0, "Global size must be greater than 0");
+        return {};
+    }
 #else
    __device__ index_int nglobal() const
    {
-        return blockDim.x * gridDim.x; // NOLINT
+        MIGRAPHX_ASSERT(compute_global_size() > 0);
+        return compute_global_size(); // NOLINT
    }
 #endif

-#ifdef MIGRAPHX_NLOCAL
-    constexpr index_constant<MIGRAPHX_NLOCAL> nlocal() const { return {}; }
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    constexpr index_constant<MIGRAPHX_NLOCAL> nlocal() const
+    {
+        static_assert(MIGRAPHX_NLOCAL > 0, "Local size must be greater than 0");
+        return {};
+    }
 #else
    __device__ index_int nlocal() const
    {
-        return blockDim.x; // NOLINT
+#ifdef MIGRAPHX_NGROUP
+        static_assert((MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL != 0) and (MIGRAPHX_NGROUP > 1),
+                      "Local size should be const");
+#endif
+        MIGRAPHX_ASSERT(compute_local_size() > 0);
+        return compute_local_size(); // NOLINT
+    }
+#endif
+
+#ifdef MIGRAPHX_NLOCAL
+    constexpr index_constant<MIGRAPHX_NLOCAL> max_nlocal() const { return {}; }
+#else
+    __device__ index_int max_nlocal() const
+    {
+        MIGRAPHX_ASSERT(blockDim.x > 0);
+        return blockDim.x;
    }
 #endif
    template <class N, class Stride>
@@ -63,6 +138,7 @@ struct index
    template <class F, class N, class Stride>
    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
    {
+        MIGRAPHX_ASSERT(start < stride);
        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
                     max_stride_iterations(n, stride) == 1)
        {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -97,13 +97,14 @@ MIGRAPHX_DPP_REDUCE(op::product, v_mul)
 template <class Op, class T, class Index, class F>
 __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
+    MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
 #if __AMDGCN_WAVEFRONT_SIZE == 32
    constexpr index_int lanes_per_thread = 16;
 #else
    constexpr index_int lanes_per_thread = 64;
 #endif
    using type = decltype(f(0));
-    __shared__ type buffer[idx.nlocal() / lanes_per_thread];
+    __shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);
@@ -126,9 +127,9 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 template <class Op, class T, class Index, class F>
 __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
-
+    MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
    using type = decltype(f(0));
-    __shared__ type buffer[idx.nlocal()];
+    __shared__ type buffer[idx.max_nlocal()];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    buffer[idx.local] = x;

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -27,42 +27,24 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/make_op.hpp>

-#include <migraphx/op/abs.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/dot.hpp>
-#include <migraphx/op/elu.hpp>
 #include <migraphx/op/if_op.hpp>
-#include <migraphx/op/leaky_relu.hpp>
-#include <migraphx/op/lrn.hpp>
-#include <migraphx/op/pooling.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/quant_dot.hpp>

-#include <migraphx/gpu/abs.hpp>
 #include <migraphx/gpu/batch_norm_inference.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/deconvolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
-#include <migraphx/gpu/elu.hpp>
-#include <migraphx/gpu/equal.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/greater.hpp>
 #include <migraphx/gpu/int8_conv_pack.hpp>
-#include <migraphx/gpu/leaky_relu.hpp>
-#include <migraphx/gpu/less.hpp>
-#include <migraphx/gpu/logical_and.hpp>
-#include <migraphx/gpu/logical_or.hpp>
-#include <migraphx/gpu/logical_xor.hpp>
-#include <migraphx/gpu/lrn.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/unary_not.hpp>
-#include <migraphx/gpu/where.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/program.hpp>

--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -23,6 +23,7 @@
 */
 #include <migraphx/gpu/prefuse_ops.hpp>
 #include <migraphx/match/layernorm.hpp>
+#include <migraphx/check_shapes.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>


--- a/src/tf/parse_relu6.cpp
+++ b/src/tf/parse_relu6.cpp
@@ -41,8 +41,9 @@ struct parse_relu6 : op_parser<parse_relu6>
                          const tf_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
-        auto min_val = info.add_literal(0.0f);
-        auto max_val = info.add_literal(6.0f);
+        shape::type_t output_type = args[0]->get_shape().type();
+        auto min_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {0.0f}});
+        auto max_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {6.0f}});

        return info.add_common_op("clip", args[0], min_val, max_val);
    }

--- a/test/onnx/onnx_test.cpp
+++ b/test/onnx/onnx_test.cpp
@@ -38,7 +38,6 @@
 #include <migraphx/onnx.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/op/convolution.hpp>
-#include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
 #include <migraphx/op/lrn.hpp>
 #include <migraphx/op/reshape.hpp>

--- a/test/tf/gen_tf_pb.py
+++ b/test/tf/gen_tf_pb.py
@@ -495,10 +495,10 @@ def relu6_test(g1):


 @tf_test
-def relu6_mismatch_test(g1):
+def relu6_half_test(g1):
    with g1.as_default():
        g1_input = tf.compat.v1.placeholder(tf.float16,
-                                            shape=(1, 3, 13, 37),
+                                            shape=(1, 3, 16, 16),
                                            name='0')
        tf.nn.relu6(g1_input, 'relu6')

@@ -708,7 +708,7 @@ if __name__ == '__main__':
    pow_test()
    relu_test()
    relu6_test()
-    relu6_mismatch_test()
+    relu6_half_test()
    reshape_test()
    rsqrt_test()
    shape_test()

--- a/test/tf/relu6_mismatch_test.pb
+++ b/test/tf/relu6_mismatch_test.pb
@@ -2,7 +2,7 @@
 :
 0Placeholder*
 dtype0*
-shape:
%
+shape:

 relu6Relu60*
 T0"
\ No newline at end of file
--- a/test/tf/tf_test.cpp
+++ b/test/tf/tf_test.cpp
@@ -729,27 +729,23 @@ TEST_CASE(relu6_test)
    EXPECT(p == prog);
 }

-TEST_CASE(relu6_mismatch_test)
+TEST_CASE(relu6_half_test)
 {
    migraphx::program p;

    auto* mm = p.get_main_module();
-    std::vector<size_t> input_lens{1, 3, 13, 37};
-    auto l0      = mm->add_parameter("0", migraphx::shape{migraphx::shape::half_type, input_lens});
-    auto min_val = mm->add_literal(0.0f);
-    auto max_val = mm->add_literal(6.0f);
-
-    auto l0_convert = mm->add_instruction(
-        migraphx::make_op("convert", {{"target_type", migraphx::shape::float_type}}), l0);
-
+    std::vector<size_t> input_lens{1, 3, 16, 16};
+    auto l0 = mm->add_parameter("0", migraphx::shape{migraphx::shape::half_type, input_lens});
+    auto min_val =
+        mm->add_literal(migraphx::literal{migraphx::shape{migraphx::shape::half_type}, {0.0f}});
+    auto max_val =
+        mm->add_literal(migraphx::literal{migraphx::shape{migraphx::shape::half_type}, {6.0f}});
    min_val = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
                                  min_val);
    max_val = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
                                  max_val);
-
-    mm->add_instruction(migraphx::make_op("clip"), l0_convert, min_val, max_val);
-
-    auto prog = optimize_tf("relu6_mismatch_test.pb", false);
+    mm->add_instruction(migraphx::make_op("clip"), l0, min_val, max_val);
+    auto prog = optimize_tf("relu6_half_test.pb", false);

    EXPECT(p == prog);
 }

--- a/test/verify/test_conv_group_add.cpp
+++ b/test/verify/test_conv_group_add.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+struct test_conv_group_add : verify_program<test_conv_group_add>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {1, 68, 28, 28}};
+        auto x    = mm->add_parameter("x", s);
+        auto w    = mm->add_parameter("w", {migraphx::shape::float_type, {68, 17, 1, 1}});
+        auto b    = mm->add_parameter("b", {migraphx::shape::float_type, {68}});
+        auto conv = mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), x, w);
+        auto bb   = mm->add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 68, 28, 28}}}), b);
+        mm->add_instruction(migraphx::make_op("add"), conv, bb);
+        return p;
+    }
+};