merge changes

f155b0e6 · Umang Yadav · c4cee345 · f155b0e6 · f155b0e6 · f155b0e6
Commit f155b0e6 authored Dec 01, 2023 by Umang Yadav
8 changed files
--- a/src/eliminate_fp8.cpp
+++ b/src/eliminate_fp8.cpp
@@ -21,7 +21,6 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include "migraphx/serialize.hpp"
 #include <iterator>
 #include <utility>
 #include <migraphx/eliminate_fp8.hpp>
@@ -30,6 +29,7 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/stringutils.hpp>
+#include <migraphx/serialize.hpp>
 #include <migraphx/ranges.hpp>
 namespace migraphx {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
@@ -22,7 +22,6 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
 #define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
 #include <migraphx/kernels/type_traits.hpp>
 namespace migraphx {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
@@ -394,7 +394,6 @@ using fp8e5m2fnuz = float8<migraphx::fp8::f8_type::bf8, true>;
    }
 // NOLINTNEXTLINE
 #define MIGRAPHX_FP8_OTHER_OPS(T)                                            \
    inline constexpr __device__ T fabs(T v)                                  \
    {                                                                        \
@@ -502,7 +501,6 @@ class numeric_limits<fp8e5m2fnuz>
    {
        return fp8e5m2fnuz(0x7F, fp8e5m2fnuz::from_bits());
    }
    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01. I am not sure if we
    // want to make this distinction. For the floating points we would end up using lowest most of
    // the times.
@@ -530,7 +528,9 @@ class numeric_limits<fp8e5m2>
    }
    static constexpr __device__ fp8e5m2 max() { return fp8e5m2(0x7B, fp8e5m2::from_bits()); }
-    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01.
+    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01. I am not sure if we
+    // want to make this distinction. For the floating points we would end up using lowest most of
+    // the time.
    static constexpr __device__ fp8e5m2 min() { return fp8e5m2(0x4, fp8e5m2::from_bits()); }
    static constexpr __device__ fp8e5m2 lowest() { return fp8e5m2(0xFB, fp8e5m2::from_bits()); }
@@ -539,7 +539,6 @@ class numeric_limits<fp8e5m2>
 };
 } // namespace fp8
 template <class T,
          MIGRAPHX_REQUIRES(is_same<T, fp8::fp8e4m3fnuz>{} or is_same<T, fp8::fp8e5m2fnuz>{} or
                            is_same<T, fp8::fp8e4m3fn>{} or is_same<T, fp8::fp8e5m2>{})>
@@ -560,7 +559,6 @@ constexpr T numeric_lowest(migraphx::fp8::f8_type unused = migraphx::fp8::f8_typ
    (void)(unused);
    return fp8::numeric_limits<T>::lowest();
 }
 } // namespace migraphx
 // =================================================================================================
 #if defined(__clang__)

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -54,7 +54,6 @@ __device__ void generic_binary_layernorm(
        using value_type = typename Input1::type;
        using vec_value_type       = vec_type<value_type>;
        constexpr auto relements   = r.template elements<Input1>();
        constexpr auto relements_r = vec_value_type{1.0 / relements};
        auto relements_rsqrt       = sqrt(relements_r);

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
@@ -118,7 +118,7 @@ struct highest
    template <class T>
    constexpr operator T() const
    {
-        return numeric_max<vec_type<T>, void>();
+        return numeric_max<vec_type<T>>();
    }
 };
 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
@@ -40,7 +40,6 @@ __device__ void pad(const index& idx,
                    const PadVal& pad_val)
 {
    auto output_shape = output.get_shape();
-    using otype       = typename Output::type;
    idx.global_stride(output_shape.elements(), [&](auto i) {
        // 1. get current multi-index for output
        // 2. get the size of the input to determine input boundaries

--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -56,7 +56,7 @@ struct avg_pool
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
    {
-        return static_cast<T>(x + y);
+        return x + y;
    }
    template <class T>
@@ -70,7 +70,6 @@ template <class Iterator, class Op>
 MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
    const Iterator data, const array<index_int, 2>& dims, array<float, 2> xy, Op pooling)
 {
-    using ret_type = typename Iterator::value_type;
    array<int, 2> low{};
    array<int, 2> high{};
    for(index_int ii = 0; ii < xy.size(); ++ii)
@@ -93,7 +92,6 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
                                high[0] * dims[1] + low[1],
                                high[0] * dims[1] + high[1]};
    float ly = xy[0] - low[0];
    float lx = xy[1] - low[1];
    float hy = 1.0f - ly;
@@ -204,25 +202,25 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
        const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
        if constexpr(s.is_avg_pooling)
        {
-            y_t[i] = static_cast<ytype>(calc_pooling(offset_x,
+            y_t[i] = calc_pooling(offset_x,
-                                                     roi_starts,
+                                  roi_starts,
-                                                     bin_size,
+                                  bin_size,
-                                                     {ph, pw},
+                                  {ph, pw},
-                                                     bin_grid_size,
+                                  bin_grid_size,
-                                                     in_dims,
+                                  in_dims,
-                                                     s.roi_offset,
+                                  s.roi_offset,
-                                                     avg_pool{}));
+                                  avg_pool{});
        }
        else
        {
-            y_t[i] = static_cast<ytype>(calc_pooling(offset_x,
+            y_t[i] = calc_pooling(offset_x,
-                                                     roi_starts,
+                                  roi_starts,
-                                                     bin_size,
+                                  bin_size,
-                                                     {ph, pw},
+                                  {ph, pw},
-                                                     bin_grid_size,
+                                  bin_grid_size,
-                                                     in_dims,
+                                  in_dims,
-                                                     s.roi_offset,
+                                  s.roi_offset,
-                                                     max_pool{}));
+                                  max_pool{});
        }
    }
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
@@ -33,7 +33,6 @@ template <index_int Axis, class Input, class Output>
 __device__ void softmax(Input input1, Output output)
 {
    using block = reduce::auto_block<reduce::reduce_elements_with_axis<Input, Axis>()>;
-    using otype = typename Output::type;
    block::template run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
        auto input = r.inner(op::id{})(input1);
 #ifdef MIGRAPHX_USE_FAST_SOFTMAX