Commit 1398bcc1 authored by kahmed10, committed by mvermeulen

Add_clip fusion (#370)

* initial testing of add_clip fusion

* formatting

* clipped relu fusion

* formatting

* remove some executables, add fusion test

* formatting

* remove clipped_relu code

* fix clang-tidy

* revert changes to cmake files

* remove fusion from weight map

* formatting

* fix syntax error

* formatting

* fix syntax error

* fix syntax error

* formatting
parent 84a3f56e
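
In short, the pass replaces an elementwise add (or an already-fused triadd) followed by a clip with a single kernel that clamps the sum in one pass over the data. A scalar sketch of the intended semantics (reference only, names are illustrative and not part of the diff):

#include <algorithm>

// Reference semantics of the fused kernels: clamp the elementwise sum
// once instead of running an add kernel and then a clip kernel.
template <class T>
T add_clip_ref(T x, T y, T min, T max)
{
    return std::min(std::max(x + y, min), max);
}

template <class T>
T triadd_clip_ref(T x, T y, T z, T min, T max)
{
    return std::min(std::max(x + y + z, min), max);
}
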
@@ -16,6 +16,18 @@ void mul_add_relu(hipStream_t stream,
[](auto x, auto a, auto b) { return std::max<decltype(a * x + b)>(0, a * x + b); });
}
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const float max,
const float min)
{
nary(stream, result, arg1, arg2)([max, min](auto x, auto y) {
return std::min<decltype(x + y)>(std::max<decltype(x)>(min, x + y), max);
});
}
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
@@ -42,6 +54,19 @@ void add_tanh(hipStream_t stream,
nary(stream, result, arg1, arg2)([](auto x, auto y) { return ::tanh(to_hip_type(x + y)); });
}
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3,
const float max,
const float min)
{
nary(stream, result, arg1, arg2, arg3)([max, min](auto x, auto y, auto z) {
return std::min<decltype(x + y + z)>(std::max<decltype(x)>(min, x + y + z), max);
});
}
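
As a quick sanity check of the lambda above with ReLU6 parameters (min = 0, max = 6, the case the new test below exercises):

#include <algorithm>
#include <cassert>

int main()
{
    const float min = 0.0f, max = 6.0f; // ReLU6 bounds, as in the test below
    auto clip_sum = [max, min](float x, float y) {
        return std::min(std::max(min, x + y), max);
    };
    assert(clip_sum(2.5f, 4.0f) == 6.0f);  // 6.5 clamped to max
    assert(clip_sum(-3.0f, 1.0f) == 0.0f); // -2.0 clamped to min
    assert(clip_sum(1.0f, 2.0f) == 3.0f);  // inside the range, unchanged
}
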
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
......
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/mul_add.hpp>
@@ -8,6 +9,7 @@
#include <migraphx/gpu/device/add.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/array.hpp>
#include <migraphx/op/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
@@ -170,6 +172,65 @@ struct hip_triadd
}
};
struct hip_triadd_clip
{
op::clip op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return op::clip::reflect(self.op, f);
}
std::string name() const { return "hip::triadd_clip"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4);
return inputs.front();
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add_clip(ctx.get_stream().get(),
args.at(3),
args.at(0),
args.at(1),
args.at(2),
op.max_val,
op.min_val);
return args.at(3);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
struct hip_add_clip
{
op::clip op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return op::clip::reflect(self.op, f);
}
std::string name() const { return "hip::add_clip"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(3);
return inputs.front();
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add_clip(
ctx.get_stream().get(), args.at(2), args.at(0), args.at(1), op.max_val, op.min_val);
return args.at(2);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
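
Both ops follow the GPU target's preallocated-output convention: the last input is the destination buffer, compute() writes into it and returns it, and output_alias() reports its index so no fresh allocation is made. A simplified host-side stand-in (hypothetical types, reference only, not the real argument/shape interfaces):

#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified stand-in for hip::add_clip's calling convention: the caller
// supplies `out`, the op fills it and returns it, and output_alias()
// points at that final argument.
struct add_clip_sketch
{
    float max_val;
    float min_val;

    std::vector<float>& compute(const std::vector<float>& x,
                                const std::vector<float>& y,
                                std::vector<float>& out) const
    {
        for(std::size_t i = 0; i < out.size(); i++)
            out[i] = std::min(std::max(min_val, x[i] + y[i]), max_val);
        return out; // aliases the caller's buffer, no new allocation
    }

    std::ptrdiff_t output_alias(std::size_t nargs) const
    {
        return static_cast<std::ptrdiff_t>(nargs) - 1; // last input is the output
    }
};
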
struct hip_triadd_relu : ternary_device<hip_triadd_relu, &device::add_relu>
{
};
@@ -253,6 +314,35 @@ void move_standard_front(std::vector<instruction_ref>& args)
std::swap(*it, args.front());
}
struct find_add_clip
{
auto matcher() const
{
return match::name(std::unordered_set<std::string>{"gpu::clip", "gpu::clipped_relu"})(
match::arg(0)(match::any_of(match::name("gpu::add"),
match::name("hip::triadd"),
match::any_of[match::inputs()](match::standard_shape()))
.bind("add")));
}
void apply(program& p, match::matcher_result r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
auto&& op = any_cast<gpu::hip_clip>(ins->get_operator()).op;
auto args = add_ins->inputs();
move_standard_front(args);
move_broadcasted_back(args);
// Use the allocation from the clip operator
args.back() = ins->inputs().back();
if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, hip_add_clip{op}, args);
else if(add_ins->name() == "hip::triadd")
p.replace_instruction(ins, hip_triadd_clip{op}, args);
}
};
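
Sketched as before/after program IR (op names from this diff; value and buffer names are illustrative), the rewrite is:

// before: two kernel launches, two output buffers
//   sum = gpu::add(x, y, alloc0)
//   out = gpu::clip(sum, alloc1)        // uses op.min_val / op.max_val
// after find_add_clip: one launch, writing into clip's buffer
//   out = hip::add_clip(x, y, alloc1)
// the triadd case fuses the same way:
//   hip::triadd + gpu::clip  ->  hip::triadd_clip
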
struct find_add_unary
{
std::string op_name;
@@ -490,7 +580,8 @@ struct find_conv_bias
context* ctx = nullptr;
auto matcher() const
{
return conv_bias(match::none_of(match::output(match::name("gpu::relu"))));
return conv_bias(match::none_of(
match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
}
void apply(program& p, match::matcher_result r) const
@@ -521,7 +612,8 @@ void fuse_ops::apply(program& p) const
find_mul_add_relu{},
find_add_unary{"gpu::relu", hip_add_relu{}, hip_triadd_relu{}},
find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}}
find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
find_add_clip{}
);
// clang-format on
}
......
@@ -17,6 +17,13 @@ void mul_add_relu(hipStream_t stream,
const argument& arg2,
const argument& arg3);
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
float max,
float min);
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
@@ -32,6 +39,14 @@ void add_tanh(hipStream_t stream,
const argument& arg1,
const argument& arg2);
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3,
float max,
float min);
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
......
@@ -95,16 +95,13 @@ void schedule_model::record(program& p, instruction_ref ins, std::size_t wait_id
static std::unordered_map<std::string, std::size_t> create_weight_map()
{
-    return {
-        {"hip::load_literal", 0},
-        {"hip::allocate", 0},
-        {"gpu::convolution", 4},
-        {"gpu::conv_bias_relu", 4},
-        {"gpu::pooling", 2},
-        {"gpu::gemm", 2},
-        {"gpu::concat", 1},
-        {"hip::add_relu", 2},
-    };
+    return {{"hip::load_literal", 0},
+            {"hip::allocate", 0},
+            {"gpu::convolution", 4},
+            {"gpu::conv_bias_relu", 4},
+            {"gpu::pooling", 2},
+            {"gpu::gemm", 2},
+            {"gpu::concat", 1}};
}
static const std::unordered_map<std::string, std::size_t>& weight_map()
......
@@ -822,6 +822,27 @@ struct test_conv_relu_half : verify_program<test_conv_relu_half>
}
};
struct test_conv_bias_clipped_relu : verify_program<test_conv_bias_clipped_relu>
{
migraphx::program create_program() const
{
migraphx::program p;
auto input =
p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
auto weights =
p.add_parameter("w", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
auto l0 = migraphx::literal{migraphx::shape{migraphx::shape::float_type, {4}},
{2.0f, 2.0f, 2.0f, 2.0f}};
auto bias = p.add_literal(l0);
auto conv = p.add_instruction(migraphx::op::convolution{}, input, weights);
auto bcast_add =
p.add_instruction(migraphx::op::broadcast{1, conv->get_shape().lens()}, bias);
auto bias_add = p.add_instruction(migraphx::op::add{}, conv, bcast_add);
p.add_instruction(migraphx::op::clip{6.0f, 0.0f}, bias_add);
return p;
}
};
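
One way to see the fusion end to end (a sketch assuming the MIGraphX host API of this era; it reuses the test struct defined above):

#include <iostream>
#include <migraphx/gpu/target.hpp>

// Compile the test program for the GPU target and print its IR; after
// fuse_ops runs, the bias add + clip pair should appear as hip::add_clip.
int main()
{
    migraphx::program p = test_conv_bias_clipped_relu{}.create_program();
    p.compile(migraphx::gpu::target{});
    std::cout << p << std::endl;
}
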
struct test_add_relu : verify_program<test_add_relu>
{
migraphx::program create_program() const
......