Enable pointwise fusion by default (#1082)

There is now a MIGRAPHX_DISABLE_POINTWISE_FUSION to disable it

Enable pointwise fusion by default (#1082)
There is now a MIGRAPHX_DISABLE_POINTWISE_FUSION to disable it
c7419a9c · Paul Fultz II · GitHub · e64b773f · c7419a9c · c7419a9c
Unverified Commit c7419a9c authored Feb 09, 2022 by Paul Fultz II Committed by GitHub Feb 09, 2022
6 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
@@ -6,15 +6,32 @@
 namespace migraphx {
+template <class T>
+struct remove_vec_impl
+{
+    using type = T;
+};
+template <class T, index_int N>
+struct remove_vec_impl<vec<T, N>>
+{
+    using type = T;
+};
+template <class T>
+using remove_vec = typename remove_vec_impl<T>::type;
 template <class T, class... Shapes>
 constexpr auto traverse_preload(Shapes... ss)
 {
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
+            using type          = remove_vec<typename decltype(x)::type>;
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = _c<s.element_space()>;
-            if constexpr(not s.broadcasted() or (s.elements() - size) < 64)
+            if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or
+                         not is_same<T, type>{})
                return f(x, offset, false_type{});
            else
            {
@@ -78,23 +95,23 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        invoke);
 }
-template <class T>
+template <class T, class Shape>
-struct remove_vec
+struct shape_type : Shape
 {
    using type = T;
 };
-template <class T, index_int N>
+template <class T>
-struct remove_vec<vec<T, N>>
+constexpr auto make_shape_type(T)
 {
-    using type = T;
+    return shape_type<typename T::type, typename T::shape_type>{};
-};
+}
 template <class T, class... Ts>
 __device__ auto preload(index idx, Ts... xs)
 {
-    using type               = typename remove_vec<T>::type;
+    using type               = remove_vec<T>;
-    constexpr auto size      = decltype(compute_preload_size<type>(xs.get_shape()...)){};
+    constexpr auto size      = decltype(compute_preload_size<type>(make_shape_type(xs)...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)

--- a/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
@@ -9,7 +9,8 @@ namespace migraphx {
 template <class T, class Shape>
 struct tensor_view
 {
-    using type = T;
+    using type       = T;
+    using shape_type = Shape;
    constexpr Shape get_shape() const { return Shape{}; }
    constexpr index_int size() const { return get_shape().elements(); }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -25,6 +25,16 @@ struct is_convertible : bool_constant<__is_convertible(From, To)>
 {
 };
+template <class T, class U>
+struct is_same : false_type
+{
+};
+template <class T>
+struct is_same<T, T> : true_type
+{
+};
 #define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
 } // namespace migraphx

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -44,7 +44,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_POINTWISE_FUSION)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
 struct id_pass
 {
@@ -100,7 +100,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        simplify_reshapes{},
        propagate_constant{},
        dead_code_elimination{},
-        enable_pass(enabled(MIGRAPHX_ENABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
+        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
        mlir_conv{&ctx},
        lowering{&ctx, options.offload_copy},

--- a/test/gpu/fast_math.cpp
+++ b/test/gpu/fast_math.cpp
-#include <test.hpp>
-#include <migraphx/quantization.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/op/add.hpp>
-#include <migraphx/op/mul.hpp>
-#include <migraphx/op/multibroadcast.hpp>
-#include <migraphx/op/pow.hpp>
-#include <migraphx/op/tanh.hpp>
-#include <migraphx/gpu/target.hpp>
-#include <migraphx/instruction.hpp>
-migraphx::program create_gelu()
-{
-    migraphx::program p;
-    auto* mm                 = p.get_main_module();
-    std::vector<float> data0 = {0.044715};
-    std::vector<float> data1 = {0.797885};
-    std::vector<float> data2 = {3};
-    std::vector<float> data3 = {0.5};
-    migraphx::shape s0{migraphx::shape::float_type, {1}};
-    std::vector<size_t> x_dims{1, 1, 5};
-    auto x         = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, x_dims});
-    auto const_val = mm->add_literal(migraphx::literal{s0, data0});
-    auto sqrt_2_pi = mm->add_literal(migraphx::literal{s0, data1});
-    auto three_val = mm->add_literal(migraphx::literal{s0, data2});
-    auto half_val  = mm->add_literal(migraphx::literal{s0, data3});
-    auto mbcast_3         = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, three_val);
-    auto pow_op           = mm->add_instruction(migraphx::op::pow{}, x, mbcast_3);
-    auto mbcast_const     = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, const_val);
-    auto mul_const        = mm->add_instruction(migraphx::op::mul{}, mbcast_const, pow_op);
-    auto add_x            = mm->add_instruction(migraphx::op::add{}, x, mul_const);
-    auto mbcast_sqrt_2_pi = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, sqrt_2_pi);
-    auto mul_add_x        = mm->add_instruction(migraphx::op::mul{}, mbcast_sqrt_2_pi, add_x);
-    auto tanh_op          = mm->add_instruction(migraphx::op::tanh{}, mul_add_x);
-    auto mbcast_half      = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, half_val);
-    auto mul_half         = mm->add_instruction(migraphx::op::mul{}, mbcast_half, tanh_op);
-    auto add_mul_half     = mm->add_instruction(migraphx::op::add{}, mul_half, mbcast_half);
-    auto mul_x            = mm->add_instruction(migraphx::op::mul{}, x, add_mul_half);
-    mm->add_return({mul_x});
-    return p;
-}
-TEST_CASE(enable_fast_gelu)
-{
-    migraphx::program p = create_gelu();
-    p.compile(migraphx::gpu::target{});
-    CHECK(any_of(*p.get_main_module(), [&](auto&& i) { return i.name() == "gpu::gelu"; }));
-}
-TEST_CASE(disable_fast_gelu)
-{
-    migraphx::program p = create_gelu();
-    migraphx::compile_options options;
-    options.fast_math = false;
-    p.compile(migraphx::gpu::target{}, options);
-    CHECK(any_of(*p.get_main_module(), [&](auto&& i) { return i.name() == "gpu::gelu_new"; }));
-}
-int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/verify/test_sub_int.cpp
+++ b/test/verify/test_sub_int.cpp
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+struct test_sub_int : verify_program<test_sub_int>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {3}};
+        auto x  = mm->add_parameter("x", {migraphx::shape::int16_type, {4, 5}});
+        auto y  = mm->add_parameter("y", {migraphx::shape::int16_type, {2, 3, 4, 5}});
+        auto xb = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", {2, 3, 4, 5}}}), x);
+        auto diff = mm->add_instruction(migraphx::make_op("sub"), y, xb);
+        mm->add_return({diff});
+        return p;
+    }
+};