Merge branch 'develop' into uif2-initial

8d7a8a6c · Artur Wojcik · 25b33431 · a09dc502 · 8d7a8a6c · 8d7a8a6c
Commit 8d7a8a6c authored Dec 06, 2023 by Artur Wojcik
20 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
@@ -53,35 +53,35 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
    auto indices_shape_lens = indices_shape.lens;
    auto data_shape_lens    = data_shape.lens;
    auto num_slice_dims     = indices_shape_lens.back();
-    std::size_t num_slices =
+    size_t num_slices =
        accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
-    std::size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
+    size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
                                   data_shape_lens.end(),
                                   1,
                                   op::product{});
-    const std::size_t num_batches =
+    const size_t num_batches =
        accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
-    const std::size_t data_batch_stride =
+    const size_t data_batch_stride =
        accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
    const auto num_slices_per_batch = num_slices / num_batches;
    ind.global_stride(output_shape.elements(), [&](auto i) {
        const auto* indices_ptr     = indices_t.data();
-        const std::size_t j         = i / slice_size;
+        const size_t j              = i / slice_size;
-        const std::size_t batch_idx = j / num_slices_per_batch;
+        const size_t batch_idx      = j / num_slices_per_batch;
        auto* slice_indices               = indices_ptr + (j * num_slice_dims);
-        std::size_t relative_slice_offset = 0;
+        size_t relative_slice_offset      = 0;
-        for(std::size_t idx = 0; idx < num_slice_dims; ++idx)
+        for(size_t idx = 0; idx < num_slice_dims; ++idx)
        {
            int64_t index                   = slice_indices[idx];
-            const std::size_t input_dim_idx = batch_dims + idx;
+            const size_t input_dim_idx      = batch_dims + idx;
            const auto input_dim            = data_shape_lens[input_dim_idx];
            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
                            index < static_cast<int64_t>(input_dim));
            if(index < 0)
                index += input_dim;
-            std::size_t size_from_slice_dims =
+            size_t size_from_slice_dims =
                accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                           data_shape_lens.begin() + batch_dims + num_slice_dims,
                           slice_size,

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -52,14 +52,17 @@ __device__ void generic_binary_layernorm(
    block::template run<reduce_output>([&](auto, auto r) {
        auto input       = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
        using value_type = typename Input1::type;
+        using vec_value_type       = vec_type<value_type>;
        constexpr auto relements   = r.template elements<Input1>();
-        constexpr auto relements_r = vec_type<value_type>{1.0 / relements};
+        constexpr auto relements_r = vec_value_type{1.0 / relements};
        auto relements_rsqrt       = sqrt(relements_r);
-        auto means = r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
+        auto means = r.reduce(op::sum{},
+                              make_array<vec_value_type>(vec_value_type{0}, vec_value_type{0}),
+                              [&](auto x) {
                                  auto x_out = x * relements_r;
-            // dividing x by sqrt(relements) before squaring allows computing higher values
+                                  // dividing x by sqrt(relements) before squaring allows computing
-            // before overflow in low precision
+                                  // higher values before overflow in low precision
                                  auto x2_sqrt = x * relements_rsqrt;
                                  return make_array(x_out, x2_sqrt * x2_sqrt);
                              })(input);
@@ -67,7 +70,7 @@ __device__ void generic_binary_layernorm(
        auto mean_x        = means[0];
        auto mean_x2       = means[1];
        auto variance      = mean_x2 - (mean_x * mean_x);
-        value_type eps_val = eps; // implicit conversion for eps
+        value_type eps_val = implicit_conversion(eps);
        r.inner([&](auto& y, auto x, auto... xs) {
            auto m = x - mean_x;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -29,11 +29,15 @@
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/hip.hpp>
+#include <migraphx/kernels/float8.hpp>
 namespace migraphx {
 namespace math {
 constexpr float as_float(migraphx::half x) { return x; }
+constexpr float as_float(migraphx::fp8::fp8e4m3fnuz x) { return x; }
 template <class T>
 constexpr T as_float(T x)
 {
@@ -57,14 +61,14 @@ constexpr T as_float(T x)
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                    \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())> \
-    auto __device__ name(type x, Ts... xs)->type                       \
+    auto __device__ name(type x, Ts... xs) -> type                     \
    {                                                                  \
        return fname(x, xs...);                                        \
    }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
-    inline auto __device__ name(type x, type y)->type { return fname(x, y); }
+    inline auto __device__ name(type x, type y) -> type { return fname(x, y); }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                         \
@@ -72,6 +76,12 @@ constexpr T as_float(T x)
    auto __device__ name(migraphx::half x, Ts... xs)                   \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))
+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_FP8(name, fname)                                      \
+    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>             \
+    auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, Ts... xs) MIGRAPHX_RETURNS( \
+        migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(xs)...)))
 // Template with two overloads for math functions, one for half2 type and one for more generic
 // <half, N> vectorization where N is 4 or another even number.
@@ -162,6 +172,33 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
 MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
+// use float to compute fp8 overload
+MIGRAPHX_DEVICE_MATH_FP8(abs, ::abs)
+MIGRAPHX_DEVICE_MATH_FP8(acos, ::acos)
+MIGRAPHX_DEVICE_MATH_FP8(acosh, ::acosh)
+MIGRAPHX_DEVICE_MATH_FP8(asin, ::asin)
+MIGRAPHX_DEVICE_MATH_FP8(asinh, ::asinh)
+MIGRAPHX_DEVICE_MATH_FP8(atan, ::atan)
+MIGRAPHX_DEVICE_MATH_FP8(atanh, ::atanh)
+MIGRAPHX_DEVICE_MATH_FP8(ceil, ::ceil)
+MIGRAPHX_DEVICE_MATH_FP8(cos, ::cos)
+MIGRAPHX_DEVICE_MATH_FP8(cosh, ::cosh)
+MIGRAPHX_DEVICE_MATH_FP8(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_FP8(exp, ::exp)
+MIGRAPHX_DEVICE_MATH_FP8(floor, ::floor)
+MIGRAPHX_DEVICE_MATH_FP8(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH_FP8(log, ::log)
+MIGRAPHX_DEVICE_MATH_FP8(pow, ::pow)
+MIGRAPHX_DEVICE_MATH_FP8(remainder, ::remainder)
+MIGRAPHX_DEVICE_MATH_FP8(round, ::round)
+MIGRAPHX_DEVICE_MATH_FP8(rsqrt, ::rsqrt)
+MIGRAPHX_DEVICE_MATH_FP8(sin, ::sin)
+MIGRAPHX_DEVICE_MATH_FP8(sinh, ::sinh)
+MIGRAPHX_DEVICE_MATH_FP8(sqrt, ::sqrt)
+MIGRAPHX_DEVICE_MATH_FP8(tan, ::tan)
+MIGRAPHX_DEVICE_MATH_FP8(tanh, ::tanh)
+MIGRAPHX_DEVICE_MATH_FP8(fmod, ::fmod)
 // Map math functions to hip half2 functions
 // The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
 // packed into a 32-bit number.  See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
@@ -253,7 +290,7 @@ MIGRAPHX_DEVICE_MATH_VEC(where)
 template <class T, class U>
 constexpr auto convert(U v)
 {
-    return vec_transform(v)([](auto x) -> T { return x; });
+    return vec_transform(v)([](auto x) -> T { return static_cast<T>(x); });
 }
 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
@@ -28,6 +28,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
 #include <migraphx/kernels/ranges.hpp>
+#include <migraphx/kernels/vec.hpp>
 namespace migraphx {
@@ -53,9 +54,9 @@ __device__ void pad(const index& idx,
        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
           }))
-            output[multi] = pad_val;
+            output[multi] = implicit_conversion(pad_val);
        else
-            output[multi] = input[input_idx];
+            output[multi] = implicit_conversion(input[input_idx]);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -62,7 +62,7 @@ struct avg_pool
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
    {
-        return (y == 0) ? 0.0 : (x / y);
+        return (y == 0) ? T{0.0} : T{x / y};
    }
 };
@@ -76,7 +76,7 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
    {
        if(xy[ii] < -1.0f or xy[ii] > dims[ii])
        {
-            return 0;
+            return implicit_conversion(0);
        }
        xy[ii]   = migraphx::max(xy[ii], 0.0f);
@@ -96,11 +96,12 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
    float lx = xy[1] - low[1];
    float hy = 1.0f - ly;
    float hx = 1.0f - lx;
-    array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
+    // do calculations in floating point and convert final result to required type
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
    auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
    auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
-    return pooling(v01, v23);
+    return implicit_conversion(pooling(v01, v23));
 }
 template <class Iterator, class Op>
@@ -113,7 +114,8 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                            float roi_offset,
                                            Op op)
 {
-    typename Iterator::value_type output_val = op.init();
+    using in_dtype      = typename Iterator::value_type;
+    in_dtype output_val = in_dtype{op.init()};
    const int64_t count = bin_grid_size[0] * bin_grid_size[1];
    dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
        array<index_int, 2> id = {iy, ix};
@@ -148,7 +150,6 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
    const auto x    = x_t.begin();
    const auto rois = rois_t.begin();
    const auto ind  = ind_t.begin();
    // input shape
    auto x_lens      = x_t.get_shape().lens;
    auto channel_num = x_lens[1];
@@ -176,10 +177,12 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
        const auto offset_rois = rois + (n * roi_column_num);
        const int batch_ind    = ind[n];
-        array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
+        array<float, 2> roi_starts = {
-                                      offset_rois[0] * s.spatial_scale};
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
-        array<float, 2> roi_ends   = {offset_rois[3] * s.spatial_scale,
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
-                                    offset_rois[2] * s.spatial_scale};
+        array<float, 2> roi_ends = {
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
        array<float, 2> roi_size{};
        array<float, 2> bin_size{};

--- a/src/targets/gpu/include/migraphx/gpu/gather.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gather.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
@@ -43,7 +43,7 @@ __device__ void softmax(Input input1, Output output)
        auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
        auto batch_sum =
            r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
-        r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
+        r.inner([&](auto& y, auto x) { y = implicit_conversion(x / batch_sum); })(output, exp_in);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
--- a/src/targets/gpu/rocblas.cpp
+++ b/src/targets/gpu/rocblas.cpp
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
--- a/test/gpu/fuse_mlir.cpp
+++ b/test/gpu/fuse_mlir.cpp
--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
--- a/test/gpu/mlir.cpp
+++ b/test/gpu/mlir.cpp
--- a/test/onnx/.onnxrt-commit
+++ b/test/onnx/.onnxrt-commit