Commit 13403ab2 authored by Umang Yadav's avatar Umang Yadav
Browse files

roialign, softmax, pow, acosh, atanh, pad tests are enabled now

parent 1be95870
......@@ -67,7 +67,7 @@ dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
case st::uint8_type:
case st::fp8e4m3fnuz_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
......
......@@ -25,6 +25,7 @@
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wfloat-equal"
#pragma clang diagnostic ignored "-Wold-style-cast"
#endif // __clang__
#define MIGRAPHX_HIP_DEVICE __device__
......@@ -132,7 +133,7 @@ struct float8
// NOTE: ON-DEVICE... always optimal bias
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(float v,
float8(const float v,
migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
uint32_t rng = 0)
{
......@@ -145,8 +146,7 @@ struct float8
#else
// DEVICE for non-gfx940 using s/w simulation
explicit constexpr MIGRAPHX_HIP_DEVICE
#endif
float8(float v,
float8(const float v,
migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
uint32_t rng = 0)
{
......@@ -175,7 +175,42 @@ struct float8
#endif // MIGRAPHX_FP8_DOWNCAST_CLIPPING}
}
}
#endif // __gfx940___
// Constructor from half (_Float16): widens to float and delegates to the
// float8(float, rounding_mode, rng) constructor so all conversions share one
// rounding path. rm defaults to standard rounding; rng presumably seeds
// stochastic rounding when rm selects it — TODO confirm against the float ctor.
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(const _Float16 v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
    : float8((float)v, rm, rng)
{
}
// Constructor from int: converts to float first, then delegates to the
// float8(float, rounding_mode, rng) constructor. Large ints may lose
// precision in the int->float step before fp8 rounding is applied.
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(const int v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
    : float8((float)v, rm, rng)
{
}
// Constructor from uint32_t: converts to float first, then delegates to the
// float8(float, rounding_mode, rng) constructor. Values above 2^24 may lose
// precision in the uint->float step before fp8 rounding is applied.
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(const uint32_t v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
    : float8((float)v, rm, rng)
{
}
// Constructor from double: narrows to float first, then delegates to the
// float8(float, rounding_mode, rng) constructor. Double-to-float narrowing
// happens before fp8 rounding, so rm only governs the final float->fp8 step.
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(const double v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
    : float8((float)v, rm, rng)
{
}
// Constructor from bool: maps false->0.0f and true->1.0f, then delegates to
// the float8(float, rounding_mode, rng) constructor (both values are exactly
// representable, so rm has no practical effect here).
explicit constexpr MIGRAPHX_HIP_DEVICE
float8(const bool v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
    : float8((float)(v), rm, rng)
{
}
// convert to float
// #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if 0 // need constexpr operator(). This version can't be constexpr // NOLINT
......@@ -209,6 +244,8 @@ struct float8
return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(data);
}
// Explicit conversion to bool: true iff the stored fp8 value is non-zero (delegates to is_zero()).
inline constexpr explicit MIGRAPHX_HIP_DEVICE operator bool() const { return not is_zero(); }
// check for zero
inline MIGRAPHX_HIP_DEVICE constexpr bool is_zero() const
{
......
......@@ -39,6 +39,7 @@ __device__ void pad(const index& idx,
const PadVal& pad_val)
{
auto output_shape = output.get_shape();
using otype = typename Output::type;
idx.global_stride(output_shape.elements(), [&](auto i) {
// 1. get current multi-index for output
// 2. get the size of the input to determine input boundaries
......@@ -53,9 +54,9 @@ __device__ void pad(const index& idx,
if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
}))
output[multi] = pad_val;
output[multi] = otype(pad_val);
else
output[multi] = input[input_idx];
output[multi] = otype(input[input_idx]);
});
}
......
......@@ -392,7 +392,7 @@ struct block
{
using max_iterations = decltype(idx.max_local_stride_iterations(n));
inner_storage<R, max_iterations{}, N> storage;
idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); });
idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = R{f(xs(j, d)...)}; });
return storage;
}
};
......
......@@ -56,13 +56,13 @@ struct avg_pool
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
return static_cast<T>(x + y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
{
return (y == 0) ? 0.0 : (x / y);
return (y == 0) ? static_cast<T>(0.0) : static_cast<T>(x / y);
}
};
......@@ -70,13 +70,14 @@ template <class Iterator, class Op>
MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
const Iterator data, const array<index_int, 2>& dims, array<float, 2> xy, Op pooling)
{
using ret_type = typename Iterator::value_type;
array<int, 2> low{};
array<int, 2> high{};
for(index_int ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
return static_cast<ret_type>(0);
}
xy[ii] = migraphx::max(xy[ii], 0.0f);
......@@ -92,11 +93,14 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<ret_type, 4> ws = {static_cast<ret_type>(hy * hx),
static_cast<ret_type>(hy * lx),
static_cast<ret_type>(ly * hx),
static_cast<ret_type>(ly * lx)};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
......@@ -113,8 +117,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
float roi_offset,
Op op)
{
typename Iterator::value_type output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
using in_dtype = typename Iterator::value_type;
in_dtype output_val = in_dtype{op.init()};
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<index_int, 2> id = {iy, ix};
array<float, 2> locs =
......@@ -148,7 +153,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
const auto x = x_t.begin();
const auto rois = rois_t.begin();
const auto ind = ind_t.begin();
using ytype = typename W::type;
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
......@@ -176,10 +181,12 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
const auto offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
offset_rois[0] * s.spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * s.spatial_scale,
offset_rois[2] * s.spatial_scale};
array<float, 2> roi_starts = {
static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
array<float, 2> roi_ends = {
static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
......@@ -199,25 +206,25 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(s.is_avg_pooling)
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{});
y_t[i] = static_cast<ytype>(calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{}));
}
else
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{});
y_t[i] = static_cast<ytype>(calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{}));
}
}
}
......
......@@ -33,6 +33,7 @@ template <index_int Axis, class Input, class Output>
__device__ void softmax(Input input1, Output output)
{
using block = reduce::auto_block<reduce::reduce_elements_with_axis<Input, Axis>()>;
using otype = typename Output::type;
block::template run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
auto input = r.inner(op::id{})(input1);
#ifdef MIGRAPHX_USE_FAST_SOFTMAX
......@@ -43,7 +44,7 @@ __device__ void softmax(Input input1, Output output)
auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
auto batch_sum =
r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
r.inner([&](auto& y, auto x) { y = otype{x / batch_sum}; })(output, exp_in);
});
}
......
......@@ -23,21 +23,23 @@
*/
#include "verify_program.hpp"
#include <migraphx/literal.hpp>
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
template <migraphx::shape::type_t DType>
struct test_acosh : verify_program<test_acosh<DType>>
template <typename CType>
struct test_acosh : verify_program<test_acosh<CType>>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
auto* mm = p.get_main_module();
migraphx::shape::type_t DType = migraphx::shape::get_type<CType>();
migraphx::shape s{DType, {16}};
auto x = mm->add_parameter("x", s);
auto min_val = mm->add_literal(1.1f);
auto max_val = mm->add_literal(100.0f);
auto min_val = mm->add_literal(migraphx::literal{migraphx::shape{DType}, {1.1}});
auto max_val = mm->add_literal(migraphx::literal{migraphx::shape{DType}, {100.0}});
min_val =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {16}}}), min_val);
max_val =
......@@ -48,6 +50,6 @@ struct test_acosh : verify_program<test_acosh<DType>>
}
};
template struct test_acosh<migraphx::shape::float_type>;
// template struct test_acosh<migraphx::shape::half_type>;
// template struct test_acosh<migraphx::shape::fp8e4m3fnuz_type>;
template struct test_acosh<float>;
template struct test_acosh<migraphx::half>;
template struct test_acosh<migraphx::fp8::fp8e4m3fnuz>;
......@@ -23,21 +23,24 @@
*/
#include "verify_program.hpp"
#include <migraphx/float8.hpp>
#include <migraphx/half.hpp>
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
template <migraphx::shape::type_t DType>
struct test_atanh : verify_program<test_atanh<DType>>
template <typename CType>
struct test_atanh : verify_program<test_atanh<CType>>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
auto* mm = p.get_main_module();
migraphx::shape::type_t DType = migraphx::shape::get_type<CType>();
migraphx::shape s{DType, {16}};
auto x = mm->add_parameter("x", s);
auto min_val = mm->add_literal(-0.95f);
auto max_val = mm->add_literal(0.95f);
auto min_val = mm->add_literal(migraphx::literal{migraphx::shape{DType}, {-0.95f}});
auto max_val = mm->add_literal(migraphx::literal{migraphx::shape{DType}, {0.95f}});
min_val =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {16}}}), min_val);
max_val =
......@@ -48,6 +51,6 @@ struct test_atanh : verify_program<test_atanh<DType>>
}
};
template struct test_atanh<migraphx::shape::float_type>;
// template struct test_atanh<migraphx::shape::half_type>;
// template struct test_atanh<migraphx::shape::fp8e4m3fnuz_type>;
template struct test_atanh<float>;
template struct test_atanh<migraphx::half>;
template struct test_atanh<migraphx::fp8::fp8e4m3fnuz>;
......@@ -51,4 +51,4 @@ struct test_pad : verify_program<test_pad<DType>>
template struct test_pad<migraphx::shape::int32_type>;
template struct test_pad<migraphx::shape::float_type>;
template struct test_pad<migraphx::shape::half_type>;
// template struct test_pad<migraphx::shape::fp8e4m3fnuz_type>;
template struct test_pad<migraphx::shape::fp8e4m3fnuz_type>;
......@@ -27,13 +27,15 @@
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
struct test_pow : verify_program<test_pow>
template <typename CType>
struct test_pow : verify_program<test_pow<CType>>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {6}};
migraphx::shape::type_t DType = migraphx::shape::get_type<CType>();
auto* mm = p.get_main_module();
migraphx::shape s{DType, {6}};
std::vector<float> vec_e(s.elements(), 2.0f);
auto b = mm->add_parameter("x", s);
auto e = mm->add_literal(migraphx::literal(s, vec_e));
......@@ -41,4 +43,6 @@ struct test_pow : verify_program<test_pow>
return p;
}
};
// TODO: add fp8 tests
template struct test_pow<float>;
template struct test_pow<migraphx::half>;
template struct test_pow<migraphx::fp8::fp8e4m3fnuz>;
......@@ -59,5 +59,5 @@ struct test_roialign : verify_program<test_roialign<DType>>
};
template struct test_roialign<migraphx::shape::float_type>;
// template struct test_roialign<migraphx::shape::half_type>;
// template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
template struct test_roialign<migraphx::shape::half_type>;
template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
......@@ -48,7 +48,7 @@ template struct test_softmax<0, migraphx::shape::half_type>;
template struct test_softmax<1, migraphx::shape::half_type>;
template struct test_softmax<2, migraphx::shape::half_type>;
template struct test_softmax<3, migraphx::shape::half_type>;
// template struct test_softmax<0, migraphx::shape::fp8e4m3fnuz_type>;
// template struct test_softmax<1, migraphx::shape::fp8e4m3fnuz_type>;
// template struct test_softmax<2, migraphx::shape::fp8e4m3fnuz_type>;
// template struct test_softmax<3, migraphx::shape::fp8e4m3fnuz_type>;
template struct test_softmax<0, migraphx::shape::fp8e4m3fnuz_type>;
template struct test_softmax<1, migraphx::shape::fp8e4m3fnuz_type>;
template struct test_softmax<2, migraphx::shape::fp8e4m3fnuz_type>;
template struct test_softmax<3, migraphx::shape::fp8e4m3fnuz_type>;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment