Commit f7838bc8 authored by turneram

Merge remote-tracking branch 'origin/develop' into ck-elementwise

parents fea58a7b d78bcdfb
@@ -27,7 +27,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/op/topk.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/context.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -27,13 +27,6 @@
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -27,8 +27,6 @@
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/stringutils.hpp>
...
@@ -26,16 +26,7 @@
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
-#include <migraphx/permutation.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -26,15 +26,7 @@
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -24,16 +24,7 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -24,16 +24,8 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -26,15 +26,7 @@
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
-#include <migraphx/cpp_generator.hpp>
-#include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -80,7 +80,9 @@ void launch_kernel(hipFunction_t fun,
                    std::size_t global,
                    std::size_t local,
                    void* kernargs,
-                   std::size_t size)
+                   std::size_t size,
+                   hipEvent_t start,
+                   hipEvent_t stop)
 {
     assert(global > 0);
     assert(local > 0);
@@ -97,34 +99,55 @@ void launch_kernel(hipFunction_t fun,
 #endif
     };
-    auto status = hipExtModuleLaunchKernel(
-        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, reinterpret_cast<void**>(&config));
+    auto status = hipExtModuleLaunchKernel(fun,
+                                           global,
+                                           1,
+                                           1,
+                                           local,
+                                           1,
+                                           1,
+                                           0,
+                                           stream,
+                                           nullptr,
+                                           reinterpret_cast<void**>(&config),
+                                           start,
+                                           stop);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Failed to launch kernel: " + hip_error(status));
+    if(stop != nullptr)
+    {
+        status = hipEventSynchronize(stop);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to sync event: " + hip_error(status));
+    }
 }
 void kernel::launch(hipStream_t stream,
                     std::size_t global,
                     std::size_t local,
-                    std::vector<void*> args) const
+                    std::vector<void*> args,
+                    hipEvent_t start,
+                    hipEvent_t stop) const
 {
     assert(impl != nullptr);
     void* kernargs = args.data();
     std::size_t size = args.size() * sizeof(void*);
-    launch_kernel(impl->fun, stream, global, local, kernargs, size);
+    launch_kernel(impl->fun, stream, global, local, kernargs, size, start, stop);
 }
 void kernel::launch(hipStream_t stream,
                     std::size_t global,
                     std::size_t local,
-                    const std::vector<kernel_argument>& args) const
+                    const std::vector<kernel_argument>& args,
+                    hipEvent_t start,
+                    hipEvent_t stop) const
 {
     assert(impl != nullptr);
     std::vector<char> kernargs = pack_args(args);
     std::size_t size = kernargs.size();
-    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
+    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size, start, stop);
 }
 } // namespace gpu
...
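For reference, a minimal sketch (not part of this commit) of how a caller could use the new optional start/stop event parameters to time a launch. The event setup, the hipEventElapsedTime query, and the time_kernel helper name are assumptions about typical HIP usage; launch_kernel itself records the events through hipExtModuleLaunchKernel and, per the hunk above, synchronizes on stop before returning when it is non-null.

#include <hip/hip_runtime.h>
#include <cstddef>
#include <stdexcept>

// Hypothetical helper (illustration only): times one launch of an already-built
// hipFunction_t using the launch_kernel signature introduced in this diff.
float time_kernel(hipFunction_t fun,
                  hipStream_t stream,
                  std::size_t global,
                  std::size_t local,
                  void* kernargs,
                  std::size_t size)
{
    hipEvent_t start = nullptr;
    hipEvent_t stop  = nullptr;
    if(hipEventCreate(&start) != hipSuccess or hipEventCreate(&stop) != hipSuccess)
        throw std::runtime_error("failed to create hip events");

    // start/stop are recorded around the kernel by hipExtModuleLaunchKernel;
    // launch_kernel also synchronizes on stop before returning, so both events
    // are complete once this call returns.
    launch_kernel(fun, stream, global, local, kernargs, size, start, stop);

    float ms = 0.0f;
    if(hipEventElapsedTime(&ms, start, stop) != hipSuccess)
        ms = -1.0f; // elapsed time unavailable
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return ms;
}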
@@ -163,7 +163,7 @@ constexpr Iterator1 search(Iterator1 first, Iterator1 last, Iterator2 s_first, I
         {
             return last;
         }
-        if(!(*it == *s_it))
+        if(not(*it == *s_it))
         {
             break;
         }
...
@@ -153,7 +153,7 @@ struct array
         return true;
     }
-    friend constexpr bool operator!=(const array& x, const array& y) { return !(x == y); }
+    friend constexpr bool operator!=(const array& x, const array& y) { return not(x == y); }
     // This uses the product order rather than lexical order
     friend constexpr bool operator<(const array& x, const array& y)
     {
...
@@ -73,10 +73,10 @@ MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(!=)
 MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&)
 MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(^)
 MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(|)
-MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&&)
-MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(||)
-MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(!)
+MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(and)
+MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(or)
+MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(not )
 MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(~)
 MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(+)
 MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(-)
...
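A note on the and/or/not replacements that appear throughout this merge: these are the standard C++ alternative tokens for &&, || and !, so the semantics are unchanged, and they are presumably also why the macro invocations above still generate the usual operators (operator and names the same entity as operator&&). A tiny stand-alone illustration, not taken from the repository:

int main()
{
    constexpr bool t = true;
    constexpr bool f = false;
    // Alternative tokens behave exactly like their primary spellings.
    static_assert((t and f) == (t && f), "and is the same token as &&");
    static_assert((t or f) == (t || f), "or is the same token as ||");
    static_assert((not t) == (!t), "not is the same token as !");
    return 0;
}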
@@ -27,42 +27,24 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/abs.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/dot.hpp>
-#include <migraphx/op/elu.hpp>
 #include <migraphx/op/if_op.hpp>
-#include <migraphx/op/leaky_relu.hpp>
-#include <migraphx/op/lrn.hpp>
-#include <migraphx/op/pooling.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/quant_dot.hpp>
-#include <migraphx/gpu/abs.hpp>
 #include <migraphx/gpu/batch_norm_inference.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/deconvolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
-#include <migraphx/gpu/elu.hpp>
-#include <migraphx/gpu/equal.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/greater.hpp>
 #include <migraphx/gpu/int8_conv_pack.hpp>
-#include <migraphx/gpu/leaky_relu.hpp>
-#include <migraphx/gpu/less.hpp>
-#include <migraphx/gpu/logical_and.hpp>
-#include <migraphx/gpu/logical_or.hpp>
-#include <migraphx/gpu/logical_xor.hpp>
-#include <migraphx/gpu/lrn.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/unary_not.hpp>
-#include <migraphx/gpu/where.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/program.hpp>
@@ -341,7 +323,7 @@ struct miopen_apply
         catch(migraphx::exception&)
         {
             // In case no solver supports the default format, retry using the other format.
-            compile_quant_conv_with_format(!int8_x4_format);
+            compile_quant_conv_with_format(not int8_x4_format);
         }
         auto args = ins->inputs();
...
@@ -78,7 +78,7 @@ struct mlir_handle
     friend bool operator==(ptr x, ptr y) { return x.get_value() == y.get_value(); }
-    friend bool operator!=(ptr x, ptr y) { return !(x == y); }
+    friend bool operator!=(ptr x, ptr y) { return not(x == y); }
     T obj{};
 };
@@ -503,7 +503,7 @@ struct mlir_program
         pp =
             problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
         std::string tuned = get_tune_params();
-        if(!tuned.empty())
+        if(not tuned.empty())
             ops.add_attributes({{"perf_config", tuned}});
         // check if HW supports xdlops
         if(contains(get_xdlops_archs(), target_name))
...
@@ -154,7 +154,7 @@ void pack_int8_args::apply(module& m) const
             bool transa = inputs[0]->get_shape().transposed();
             bool transb = inputs[1]->get_shape().transposed();
-            if(!transb)
+            if(not transb)
             {
                 auto packed_b = m.insert_instruction(
                     ins, make_op("hip::allocate", {{"shape", to_value(inputs[1]->get_shape())}}));
...
@@ -23,6 +23,7 @@
  */
 #include <migraphx/gpu/prefuse_ops.hpp>
 #include <migraphx/match/layernorm.hpp>
+#include <migraphx/check_shapes.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>
...
@@ -42,6 +42,7 @@
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
 #include <migraphx/rewrite_batchnorm.hpp>
+#include <migraphx/rewrite_gelu.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
@@ -116,6 +117,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         inline_module{},
         rewrite_pooling{},
         dead_code_elimination{},
+        rewrite_gelu{},
+        dead_code_elimination{},
         eliminate_common_subexpression{},
         dead_code_elimination{},
         simplify_algebra{},
@@ -134,8 +137,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         lowering{&ctx, options.offload_copy},
         eliminate_contiguous{"gpu::contiguous"},
         dead_code_elimination{},
-        replace_allocate{gpu_allocation_model{}, options.offload_copy},
-        dead_code_elimination{},
         eliminate_concat{concat_gpu_optimization{}},
         dead_code_elimination{},
         pack_int8_args{},
@@ -144,6 +145,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         fuse_ops{&ctx, options.fast_math},
         dead_code_elimination{},
+        replace_allocate{gpu_allocation_model{}, options.offload_copy},
+        dead_code_elimination{},
         compile_ops{&ctx},
         dead_code_elimination{},
         write_literals{&ctx},
...
@@ -100,7 +100,7 @@ struct parse_conv : op_parser<parse_conv>
             {
                 MIGRAPHX_THROW("padding should have 4 values");
             }
-            if(padding[0] != padding[2] || padding[1] != padding[3])
+            if(padding[0] != padding[2] or padding[1] != padding[3])
             {
                 MIGRAPHX_THROW("migraphx does not support asymetric padding");
             }
...
@@ -90,7 +90,7 @@ struct parse_depthwiseconv : op_parser<parse_depthwiseconv>
         calculate_padding(0, pads, input_dims[2], op.stride[0], op.dilation[0], weight_h);
         calculate_padding(1, pads, input_dims[3], op.stride[1], op.dilation[1], weight_w);
-        if(pads[0] != pads[2] || pads[1] != pads[3])
+        if(pads[0] != pads[2] or pads[1] != pads[3])
         {
             std::vector<int64_t> padding = {0, 0, pads[0], pads[1], 0, 0, pads[2], pads[3]};
             l0 = info.add_instruction(migraphx::make_op("pad", {{"pads", padding}}), l0);
...
@@ -42,7 +42,7 @@ struct parse_pooling : op_parser<parse_pooling>
                           tf_parser::node_info info,
                           std::vector<instruction_ref> args) const
     {
-        if(!starts_with(opd.tf_name, "Max") && !starts_with(opd.tf_name, "Av"))
+        if(not starts_with(opd.tf_name, "Max") and not starts_with(opd.tf_name, "Av"))
         {
             MIGRAPHX_THROW("tf pooling mode must be Max or Average");
         }
...