remove unnecessary changes

f000fe32 · Umang Yadav · 795bea35 · f000fe32 · f000fe32 · f000fe32
Commit f000fe32 authored Sep 26, 2023 by Umang Yadav
20 changed files
--- a/include/ck/utility/generic_memory_space_atomic.hpp
+++ b/include/ck/utility/generic_memory_space_atomic.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -124,5 +121,3 @@ __device__ float2_t atomic_max<float2_t>(float2_t* p_dst, const float2_t& x)
 }
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -29,5 +26,3 @@ __device__ index_t get_grid_size() { return gridDim.x; }
 __device__ index_t get_block_size() { return blockDim.x; }
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/ignore.hpp
+++ b/include/ck/utility/ignore.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -23,5 +20,3 @@ struct ignore_t
 inline constexpr detail::ignore_t ignore;
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -237,5 +234,3 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
 }
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/integral_constant.hpp
+++ b/include/ck/utility/integral_constant.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -8,50 +5,47 @@
 namespace ck {
-template <class T, T v> struct integral_constant {
+template <class T, T v>
-  static constexpr T value = v;
+struct integral_constant
-  typedef T value_type;
+{
-  typedef integral_constant type;
+    static constexpr T value = v;
-  __host__ __device__ constexpr operator value_type() const noexcept {
+    typedef T value_type;
-    return value;
+    typedef integral_constant type;
-  }
+    __host__ __device__ constexpr operator value_type() const noexcept { return value; }
-  __host__ __device__ constexpr value_type operator()() const noexcept {
+    __host__ __device__ constexpr value_type operator()() const noexcept { return value; }
-    return value;
-  }
 };
 template <typename TX, TX X, typename TY, TY Y>
-__host__ __device__ constexpr auto operator+(integral_constant<TX, X>,
+__host__ __device__ constexpr auto operator+(integral_constant<TX, X>, integral_constant<TY, Y>)
-                                             integral_constant<TY, Y>) {
+{
-  return integral_constant<decltype(X + Y), X + Y>{};
+    return integral_constant<decltype(X + Y), X + Y>{};
 }
 template <typename TX, TX X, typename TY, TY Y>
-__host__ __device__ constexpr auto operator-(integral_constant<TX, X>,
+__host__ __device__ constexpr auto operator-(integral_constant<TX, X>, integral_constant<TY, Y>)
-                                             integral_constant<TY, Y>) {
+{
-  static_assert(Y <= X, "wrong!");
+    static_assert(Y <= X, "wrong!");
-  return integral_constant<decltype(X - Y), X - Y>{};
+    return integral_constant<decltype(X - Y), X - Y>{};
 }
 template <typename TX, TX X, typename TY, TY Y>
-__host__ __device__ constexpr auto operator*(integral_constant<TX, X>,
+__host__ __device__ constexpr auto operator*(integral_constant<TX, X>, integral_constant<TY, Y>)
-                                             integral_constant<TY, Y>) {
+{
-  return integral_constant<decltype(X * Y), X * Y>{};
+    return integral_constant<decltype(X * Y), X * Y>{};
 }
 template <typename TX, TX X, typename TY, TY Y>
-__host__ __device__ constexpr auto operator/(integral_constant<TX, X>,
+__host__ __device__ constexpr auto operator/(integral_constant<TX, X>, integral_constant<TY, Y>)
-                                             integral_constant<TY, Y>) {
+{
-  static_assert(Y > 0, "wrong!");
+    static_assert(Y > 0, "wrong!");
-  return integral_constant<decltype(X / Y), X / Y>{};
+    return integral_constant<decltype(X / Y), X / Y>{};
 }
 template <typename TX, TX X, typename TY, TY Y>
-__host__ __device__ constexpr auto operator%(integral_constant<TX, X>,
+__host__ __device__ constexpr auto operator%(integral_constant<TX, X>, integral_constant<TY, Y>)
-                                             integral_constant<TY, Y>) {
+{
-  static_assert(Y > 0, "wrong!");
+    static_assert(Y > 0, "wrong!");
-  return integral_constant<decltype(X % Y), X % Y>{};
+    return integral_constant<decltype(X % Y), X % Y>{};
 }
-} // namespace ck
-#pragma clang diagnostic pop
+} // namespace ck
--- a/include/ck/utility/is_known_at_compile_time.hpp
+++ b/include/ck/utility/is_known_at_compile_time.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -57,5 +54,3 @@ struct is_known_at_compile_time<Tuple<Ts...>>
 };
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -18,135 +15,148 @@ namespace ck {
 // magic number division
 // Caution:
-//   1. For uint32_t as dividend: magic number division implementation being
+//   1. For uint32_t as dividend: magic number division implementation being used would produce
-//   used would produce correct result if the dividend is uint32_t and its value
+//   correct result if the dividend is uint32_t and its value is within 31-bit value range.
-//   is within 31-bit value range.
+//   2. For int32_t as dividendd: magic number division for int32_t dividened has not been
-//   2. For int32_t as dividendd: magic number division for int32_t dividened
+//   implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number
-//   has not been implemented, the int32_t dividend would be bit-wise
+//   division implementation for uint32_t is then used. Therefore, dividend value need to be
-//   interpreted as uint32_t and magic number division implementation for
+//   non-negative.
-//   uint32_t is then used. Therefore, dividend value need to be non-negative.
 // TODO:
 //   1. Implement magic number divison for int32_t
 //   2. Implement magic number divison for unit32_t with 32-bit value range
-struct MagicDivision {
+struct MagicDivision
-  // uint32_t
+{
-  __host__ __device__ static constexpr auto
+    // uint32_t
-  CalculateMagicNumbers(uint32_t divisor) {
+    __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
-    // WARNING: magic division is only applicable for division inside this
+    {
-    // range. You should use the return value of CalculateMagicNumbers, if
+        // WARNING: magic division is only applicable for division inside this range.
-    // division is not inside this range. The "else" logic below is to quiet
+        // You should use the return value of CalculateMagicNumbers, if division is not inside this
-    // down run-time error.
+        // range. The "else" logic below is to quiet down run-time error.
-    if (divisor >= 1 && divisor <= INT32_MAX) {
+        if(divisor >= 1 && divisor <= INT32_MAX)
-      uint32_t shift = 0;
+        {
-      for (shift = 0; shift < 32; ++shift) {
+            uint32_t shift = 0;
-        if ((1U << shift) >= divisor) {
+            for(shift = 0; shift < 32; ++shift)
-          break;
+            {
+                if((1U << shift) >= divisor)
+                {
+                    break;
+                }
+            }
+            uint64_t one        = 1;
+            uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+            // assert(multiplier <= 0xffffffffUL);
+            return make_tuple(uint32_t(multiplier), shift);
        }
-      }
+        else
+        {
-      uint64_t one = 1;
+            return make_tuple(uint32_t(0), uint32_t(0));
-      uint64_t multiplier =
+        }
-          ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+    }
-      // assert(multiplier <= 0xffffffffUL);
+    __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
-      return make_tuple(uint32_t(multiplier), shift);
+    {
-    } else {
+        auto tmp = CalculateMagicNumbers(divisor);
-      return make_tuple(uint32_t(0), uint32_t(0));
-    }
+        return tmp[Number<0>{}];
-  }
+    }
-  __host__ __device__ static constexpr uint32_t
+    __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor)
-  CalculateMagicMultiplier(uint32_t divisor) {
+    {
-    auto tmp = CalculateMagicNumbers(divisor);
+        auto tmp = CalculateMagicNumbers(divisor);
-    return tmp[Number<0>{}];
+        return tmp[Number<1>{}];
-  }
+    }
-  __host__ __device__ static constexpr uint32_t
+    // integral_constant<uint32_t, .>
-  CalculateMagicShift(uint32_t divisor) {
+    template <uint32_t Divisor>
-    auto tmp = CalculateMagicNumbers(divisor);
+    __host__ __device__ static constexpr auto
+    CalculateMagicNumbers(integral_constant<uint32_t, Divisor>)
-    return tmp[Number<1>{}];
+    {
-  }
+        constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});
-  // integral_constant<uint32_t, .>
+        constexpr uint32_t multiplier = tmp[Number<0>{}];
-  template <uint32_t Divisor>
+        constexpr uint32_t shift      = tmp[Number<1>{}];
-  __host__ __device__ static constexpr auto
-  CalculateMagicNumbers(integral_constant<uint32_t, Divisor>) {
+        return make_tuple(integral_constant<uint32_t, multiplier>{},
-    constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});
+                          integral_constant<uint32_t, shift>{});
+    }
-    constexpr uint32_t multiplier = tmp[Number<0>{}];
-    constexpr uint32_t shift = tmp[Number<1>{}];
+    template <uint32_t Divisor>
+    __host__ __device__ static constexpr auto
-    return make_tuple(integral_constant<uint32_t, multiplier>{},
+    CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>)
-                      integral_constant<uint32_t, shift>{});
+    {
-  }
+        constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor});
-  template <uint32_t Divisor>
+        return integral_constant<uint32_t, multiplier>{};
-  __host__ __device__ static constexpr auto
+    }
-  CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>) {
-    constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor});
+    template <uint32_t Divisor>
+    __host__ __device__ static constexpr auto
-    return integral_constant<uint32_t, multiplier>{};
+    CalculateMagicShift(integral_constant<uint32_t, Divisor>)
-  }
+    {
+        constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor});
-  template <uint32_t Divisor>
-  __host__ __device__ static constexpr auto
+        return integral_constant<uint32_t, shift>{};
-  CalculateMagicShift(integral_constant<uint32_t, Divisor>) {
+    }
-    constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor});
+    // integral_constant<int32_t, .>
-    return integral_constant<uint32_t, shift>{};
+    template <int32_t Divisor>
-  }
+    __host__ __device__ static constexpr auto
+    CalculateMagicNumbers(integral_constant<int32_t, Divisor>)
-  // integral_constant<int32_t, .>
+    {
-  template <int32_t Divisor>
+        return CalculateMagicNumbers(integral_constant<uint32_t, Divisor>{});
-  __host__ __device__ static constexpr auto
+    }
-  CalculateMagicNumbers(integral_constant<int32_t, Divisor>) {
-    return CalculateMagicNumbers(integral_constant<uint32_t, Divisor>{});
+    template <int32_t Divisor>
-  }
+    __host__ __device__ static constexpr auto
+    CalculateMagicMultiplier(integral_constant<int32_t, Divisor>)
-  template <int32_t Divisor>
+    {
-  __host__ __device__ static constexpr auto
+        return CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>{});
-  CalculateMagicMultiplier(integral_constant<int32_t, Divisor>) {
+    }
-    return CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>{});
-  }
+    template <int32_t Divisor>
+    __host__ __device__ static constexpr auto
-  template <int32_t Divisor>
+    CalculateMagicShift(integral_constant<int32_t, Divisor>)
-  __host__ __device__ static constexpr auto
+    {
-  CalculateMagicShift(integral_constant<int32_t, Divisor>) {
+        return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
-    return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
+    }
-  }
+    // magic division for uint32_t
-  // magic division for uint32_t
+    __device__ static constexpr uint32_t
-  __device__ static constexpr uint32_t
+    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
-  DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) {
+    {
-    uint32_t tmp = __umulhi(dividend, multiplier);
+        uint32_t tmp = __umulhi(dividend, multiplier);
-    return (tmp + dividend) >> shift;
+        return (tmp + dividend) >> shift;
-  }
+    }
-  __host__ static constexpr uint32_t
+    __host__ static constexpr uint32_t
-  DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) {
+    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
-    uint32_t tmp = static_cast<uint64_t>(dividend) * multiplier >> 32;
+    {
-    return (tmp + dividend) >> shift;
+        uint32_t tmp = static_cast<uint64_t>(dividend) * multiplier >> 32;
-  }
+        return (tmp + dividend) >> shift;
+    }
-  // magic division for int32_t
-  // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
+    // magic division for int32_t
-  // non-negative for result to be correct
+    // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
-  // TODO: figure out how to do magic number divison for int32_t as dividended
+    // non-negative for result to be correct
-  __device__ static constexpr int32_t
+    // TODO: figure out how to do magic number divison for int32_t as dividended
-  DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) {
+    __device__ static constexpr int32_t
-    uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
-    uint32_t tmp = __umulhi(dividend_u32, multiplier);
+    {
-    return (tmp + dividend_u32) >> shift;
+        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
-  }
+        uint32_t tmp          = __umulhi(dividend_u32, multiplier);
+        return (tmp + dividend_u32) >> shift;
-  __host__ static constexpr int32_t
+    }
-  DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) {
-    uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+    __host__ static constexpr int32_t
-    uint32_t tmp = static_cast<uint64_t>(dividend_u32) * multiplier >> 32;
+    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
-    return (tmp + dividend_u32) >> shift;
+    {
-  }
+        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+        uint32_t tmp          = static_cast<uint64_t>(dividend_u32) * multiplier >> 32;
+        return (tmp + dividend_u32) >> shift;
+    }
 };
 struct MDiv
@@ -222,5 +232,3 @@ struct MDiv2
 };
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/ck.hpp"
-#include "enable_if.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
 #include "type.hpp"
+#include "enable_if.hpp"
 namespace ck {
 namespace math {
-template <typename T, T s> struct scales {
+template <typename T, T s>
-  __host__ __device__ constexpr T operator()(T a) const { return s * a; }
+struct scales
+{
+    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };
-template <typename T> struct plus {
+template <typename T>
-  __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
 };
-template <typename T> struct minus {
+template <typename T>
-  __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
+struct minus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
 };
-struct multiplies {
+struct multiplies
-  template <typename A, typename B>
+{
-  __host__ __device__ constexpr auto operator()(const A &a, const B &b) const {
+    template <typename A, typename B>
-    return a * b;
+    __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
-  }
+    {
+        return a * b;
+    }
 };
-template <typename T> struct maximize {
+template <typename T>
-  __host__ __device__ constexpr T operator()(T a, T b) const {
+struct maximize
-    return a >= b ? a : b;
+{
-  }
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; }
 };
-template <typename T> struct minimize {
+template <typename T>
-  __host__ __device__ constexpr T operator()(T a, T b) const {
+struct minimize
-    return a <= b ? a : b;
+{
-  }
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; }
 };
-template <typename T> struct integer_divide_ceiler {
+template <typename T>
-  __host__ __device__ constexpr T operator()(T a, T b) const {
+struct integer_divide_ceiler
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const
+    {
+        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-    return (a + b - Number<1>{}) / b;
+        return (a + b - Number<1>{}) / b;
-  }
+    }
 };
 template <typename X, typename Y>
-__host__ __device__ constexpr auto integer_divide_floor(X x, Y y) {
+__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
-  return x / y;
+{
+    return x / y;
 }
 template <typename X, typename Y>
-__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) {
+__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
-  return (x + y - Number<1>{}) / y;
+{
+    return (x + y - Number<1>{}) / y;
 }
 template <typename X, typename Y>
-__host__ __device__ constexpr auto integer_least_multiple(X x, Y y) {
+__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
-  return y * integer_divide_ceil(x, y);
+{
+    return y * integer_divide_ceil(x, y);
 }
-template <typename T> __host__ __device__ constexpr T max(T x) { return x; }
+template <typename T>
+__host__ __device__ constexpr T max(T x)
+{
+    return x;
+}
-template <typename T> __host__ __device__ constexpr T max(T x, T y) {
+template <typename T>
-  return x > y ? x : y;
+__host__ __device__ constexpr T max(T x, T y)
+{
+    return x > y ? x : y;
 }
 template <index_t X>
-__host__ __device__ constexpr index_t max(Number<X>, index_t y) {
+__host__ __device__ constexpr index_t max(Number<X>, index_t y)
-  return X > y ? X : y;
+{
+    return X > y ? X : y;
 }
 template <index_t Y>
-__host__ __device__ constexpr index_t max(index_t x, Number<Y>) {
+__host__ __device__ constexpr index_t max(index_t x, Number<Y>)
-  return x > Y ? x : Y;
+{
+    return x > Y ? x : Y;
 }
 template <typename X, typename... Ys>
-__host__ __device__ constexpr auto max(X x, Ys... ys) {
+__host__ __device__ constexpr auto max(X x, Ys... ys)
-  static_assert(sizeof...(Ys) > 0, "not enough argument");
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
-  return max(x, max(ys...));
+    return max(x, max(ys...));
 }
-template <typename T> __host__ __device__ constexpr T min(T x) { return x; }
+template <typename T>
+__host__ __device__ constexpr T min(T x)
+{
+    return x;
+}
-template <typename T> __host__ __device__ constexpr T min(T x, T y) {
+template <typename T>
-  return x < y ? x : y;
+__host__ __device__ constexpr T min(T x, T y)
+{
+    return x < y ? x : y;
 }
 template <index_t X>
-__host__ __device__ constexpr index_t min(Number<X>, index_t y) {
+__host__ __device__ constexpr index_t min(Number<X>, index_t y)
-  return X < y ? X : y;
+{
+    return X < y ? X : y;
 }
 template <index_t Y>
-__host__ __device__ constexpr index_t min(index_t x, Number<Y>) {
+__host__ __device__ constexpr index_t min(index_t x, Number<Y>)
-  return x < Y ? x : Y;
+{
+    return x < Y ? x : Y;
 }
 template <typename X, typename... Ys>
-__host__ __device__ constexpr auto min(X x, Ys... ys) {
+__host__ __device__ constexpr auto min(X x, Ys... ys)
-  static_assert(sizeof...(Ys) > 0, "not enough argument");
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
-  return min(x, min(ys...));
+    return min(x, min(ys...));
 }
 template <typename T>
-__host__ __device__ constexpr T clamp(const T &x, const T &lowerbound,
+__host__ __device__ constexpr T clamp(const T& x, const T& lowerbound, const T& upperbound)
-                                      const T &upperbound) {
+{
-  return min(max(x, lowerbound), upperbound);
+    return min(max(x, lowerbound), upperbound);
 }
 // disallow implicit type casting
-template <typename T> __device__ T exp(T x);
+template <typename T>
+__device__ T exp(T x);
 // TODO: add f16 support using v_exp_f16
-template <> __device__ float exp<float>(float x) { return __expf(x); }
+template <>
+__device__ float exp<float>(float x)
+{
+    return __expf(x);
+}
-template <> __device__ double exp<double>(double x) { return exp(x); }
+template <>
+__device__ double exp<double>(double x)
+{
+    return exp(x);
+}
-// static inline __host__ float exp(float x) { return ::expf(x); }
+static inline __host__ float exp(float x) { return ::expf(x); }
-// static inline __host__ double exp(double x) { return std::exp(x); }
+static inline __host__ double exp(double x) { return std::exp(x); }
 // greatest common divisor, aka highest common factor
-__host__ __device__ constexpr index_t gcd(index_t x, index_t y) {
+__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
-  if (x < 0) {
+{
-    return gcd(-x, y);
+    if(x < 0)
-  } else if (y < 0) {
+    {
-    return gcd(x, -y);
+        return gcd(-x, y);
-  } else if (x == y || x == 0) {
+    }
-    return y;
+    else if(y < 0)
-  } else if (y == 0) {
+    {
-    return x;
+        return gcd(x, -y);
-  } else if (x > y) {
+    }
-    return gcd(x % y, y);
+    else if(x == y || x == 0)
-  } else {
+    {
-    return gcd(x, y % x);
+        return y;
-  }
+    }
+    else if(y == 0)
+    {
+        return x;
+    }
+    else if(x > y)
+    {
+        return gcd(x % y, y);
+    }
+    else
+    {
+        return gcd(x, y % x);
+    }
 }
 template <index_t X, index_t Y>
-__host__ __device__ constexpr auto gcd(Number<X>, Number<Y>) {
+__host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
-  constexpr auto r = gcd(X, Y);
+{
+    constexpr auto r = gcd(X, Y);
-  return Number<r>{};
+    return Number<r>{};
 }
-template <typename X, typename... Ys,
+template <typename X, typename... Ys, typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
-          typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
+__host__ __device__ constexpr auto gcd(X x, Ys... ys)
-__host__ __device__ constexpr auto gcd(X x, Ys... ys) {
+{
-  return gcd(x, gcd(ys...));
+    return gcd(x, gcd(ys...));
 }
 // least common multiple
 template <typename X, typename Y>
-__host__ __device__ constexpr auto lcm(X x, Y y) {
+__host__ __device__ constexpr auto lcm(X x, Y y)
-  return (x * y) / gcd(x, y);
+{
+    return (x * y) / gcd(x, y);
 }
-template <typename X, typename... Ys,
+template <typename X, typename... Ys, typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
-          typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
+__host__ __device__ constexpr auto lcm(X x, Ys... ys)
-__host__ __device__ constexpr auto lcm(X x, Ys... ys) {
+{
-  return lcm(x, lcm(ys...));
+    return lcm(x, lcm(ys...));
 }
-template <typename T> struct equal {
+template <typename T>
-  __host__ __device__ constexpr bool operator()(T x, T y) const {
+struct equal
-    return x == y;
+{
-  }
+    __host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; }
 };
-template <typename T> struct less {
+template <typename T>
-  __host__ __device__ constexpr bool operator()(T x, T y) const {
+struct less
-    return x < y;
+{
-  }
+    __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
 };
 template <index_t X>
@@ -206,5 +258,3 @@ __host__ __device__ constexpr auto next_power_of_two(Number<X> x)
 } // namespace math
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -16,169 +13,177 @@
 namespace ck {
 namespace math {
-// math functions for the host,  some are implemented by calling C++ std
+// math functions for the host,  some are implemented by calling C++ std functions
-// functions
-static inline __host__ float abs(float x) { return x < 0 ? x * -1.0 : x; };
+static inline __host__ float abs(float x) { return std::abs(x); };
-static inline __host__ double abs(double x) { return x < 0 ? x * -1.0 : x; };
+static inline __host__ double abs(double x) { return std::abs(x); };
-static inline __host__ int8_t abs(int8_t x) {
+static inline __host__ int8_t abs(int8_t x)
-  int8_t sgn = x >> (8 - 1);
+{
+    int8_t sgn = x >> (8 - 1);
-  return (x ^ sgn) - sgn;
+    return (x ^ sgn) - sgn;
 };
-static inline __host__ int32_t abs(int32_t x) {
+static inline __host__ int32_t abs(int32_t x)
-  int32_t sgn = x >> (32 - 1);
+{
+    int32_t sgn = x >> (32 - 1);
-  return (x ^ sgn) - sgn;
+    return (x ^ sgn) - sgn;
 };
-static inline __host__ half_t abs(half_t x) {
+static inline __host__ half_t abs(half_t x)
-  uint16_t xx = ck::bit_cast<uint16_t>(x);
+{
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
-  uint16_t abs_xx = xx & 0x7fff;
+    uint16_t abs_xx = xx & 0x7fff;
-  half_t abs_x = ck::bit_cast<half_t>(abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);
-  return abs_x;
+    return abs_x;
 };
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-static inline __host__ int4_t abs(int4_t x) {
+static inline __host__ int4_t abs(int4_t x)
-  int4_t sgn = x >> (4 - 1);
+{
-  return (x ^ sgn) - sgn;
+    int4_t sgn = x >> (4 - 1);
+    return (x ^ sgn) - sgn;
 }
 #endif
-// TODO: to bit arithmetic to figure it out
+static inline __host__ bool isnan(float x) { return std::isnan(x); };
-static inline __host__ bool isnan(float x) {
-  (void)x;
-  return false;
-};
-static inline __host__ bool isnan(double x) {
+static inline __host__ bool isnan(double x) { return std::isnan(x); };
-  (void)x;
-  return false;
-};
-static inline __host__ bool isnan(int8_t x) {
+static inline __host__ bool isnan(int8_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
-static inline __host__ bool isnan(int32_t x) {
+static inline __host__ bool isnan(int32_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
-static inline __host__ bool isnan(half_t x) {
+static inline __host__ bool isnan(half_t x)
-  uint16_t xx = ck::bit_cast<uint16_t>(x);
+{
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
-  return (xx & 0x7FFF) > 0x7C00;
+    return (xx & 0x7FFF) > 0x7C00;
 };
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-static inline __host__ bool isnan(int4_t x) {
+static inline __host__ bool isnan(int4_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
 #endif
-// MIGRAPHX doesn't care about host compilation, just return identity values for
+static inline __host__ half_t sqrt(half_t x)
-// now
+{
+    return static_cast<half_t>(std::sqrt(static_cast<float>(x)));
-static inline __host__ half_t sqrt(half_t x) { return x; };
+};
-static inline __host__ float sqrt(float x) { return x; };
+static inline __host__ float sqrt(float x) { return std::sqrt(x); };
-static inline __host__ double sqrt(double x) { return x; };
+static inline __host__ double sqrt(double x) { return std::sqrt(x); };
-static inline __host__ half_t tanh(half_t x) { return x; };
+static inline __host__ half_t tanh(half_t x)
+{
+    return static_cast<half_t>(std::tanh(static_cast<float>(x)));
+};
-static inline __host__ float tanh(float x) { return x; };
+static inline __host__ float tanh(float x) { return std::tanh(x); };
-static inline __host__ double tanh(double x) { return x; };
+static inline __host__ double tanh(double x) { return std::tanh(x); };
-// math functions for the HIP kernel,  some are implemented by calling hip
+// math functions for the HIP kernel,  some are implemented by calling hip builtin functions
-// builtin functions
 static inline __device__ float abs(float x) { return ::abs(x); };
 static inline __device__ double abs(double x) { return ::abs(x); };
-static inline __device__ int8_t abs(int8_t x) {
+static inline __device__ int8_t abs(int8_t x)
-  int8_t sgn = x >> (8 - 1);
+{
+    int8_t sgn = x >> (8 - 1);
-  return (x ^ sgn) - sgn;
+    return (x ^ sgn) - sgn;
 };
-static inline __device__ int32_t abs(int32_t x) {
+static inline __device__ int32_t abs(int32_t x)
-  int32_t sgn = x >> (32 - 1);
+{
+    int32_t sgn = x >> (32 - 1);
-  return (x ^ sgn) - sgn;
+    return (x ^ sgn) - sgn;
 };
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-static inline __device__ int4_t abs(int4_t x) {
+static inline __device__ int4_t abs(int4_t x)
-  int4_t sgn = x >> (4 - 1);
+{
+    int4_t sgn = x >> (4 - 1);
-  return (x ^ sgn) - sgn;
+    return (x ^ sgn) - sgn;
 };
 #endif
-static inline __device__ half_t abs(half_t x) {
+static inline __device__ half_t abs(half_t x)
-  uint16_t xx = ck::bit_cast<uint16_t>(x);
+{
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
-  uint16_t abs_xx = xx & 0x7fff;
+    uint16_t abs_xx = xx & 0x7fff;
-  half_t abs_x = ck::bit_cast<half_t>(abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);
-  return abs_x;
+    return abs_x;
 };
 static inline __device__ bool isnan(float x) { return ::isnan(x); };
 static inline __device__ bool isnan(double x) { return ::isnan(x); };
-static inline __device__ bool isnan(int8_t x) {
+static inline __device__ bool isnan(int8_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
-static inline __device__ bool isnan(int32_t x) {
+static inline __device__ bool isnan(int32_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-static inline __device__ bool isnan(int4_t x) {
+static inline __device__ bool isnan(int4_t x)
-  (void)x;
+{
-  return false;
+    (void)x;
+    return false;
 };
 #endif
-static inline __device__ bool isnan(half_t x) {
+static inline __device__ bool isnan(half_t x)
-  uint16_t xx = ck::bit_cast<uint16_t>(x);
+{
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
-  return (xx & 0x7FFF) > 0x7C00;
+    return (xx & 0x7FFF) > 0x7C00;
 };
-static inline __device__ half_t sqrt(half_t x) {
+static inline __device__ half_t sqrt(half_t x)
-  return static_cast<half_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
+{
+    return static_cast<half_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
 };
-static inline __device__ float sqrt(float x) {
+static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); };
-  return __builtin_amdgcn_sqrtf(x);
-};
-static inline __device__ double sqrt(double x) {
+static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
-  return __builtin_amdgcn_sqrt(x);
-};
-static inline __device__ half_t tanh(half_t x) {
+static inline __device__ half_t tanh(half_t x)
-  return static_cast<half_t>(::tanhf(static_cast<float>(x)));
+{
+    return static_cast<half_t>(::tanhf(static_cast<float>(x)));
 };
 static inline __device__ float tanh(float x) { return ::tanhf(x); };
@@ -187,5 +192,3 @@ static inline __device__ double tanh(double x) { return ::tanh(x); };
 } // namespace math
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/multi_index.hpp
+++ b/include/ck/utility/multi_index.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -13,5 +10,3 @@
 #else
 #include "statically_indexed_array_multi_index.hpp"
 #endif
-#pragma clang diagnostic pop
--- a/include/ck/utility/number.hpp
+++ b/include/ck/utility/number.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -19,5 +16,3 @@ using LongNumber = integral_constant<long_index_t, N>;
 } // namespace ck
 #endif
-#pragma clang diagnostic pop
--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -29,5 +26,3 @@ struct float_equal_zero
 };
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/reduction_enums.hpp
+++ b/include/ck/utility/reduction_enums.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -42,5 +39,3 @@ enum struct IndicesType
 };
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -116,5 +113,3 @@ struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexD
 } // namespace detail
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -294,5 +291,3 @@ struct InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::Add,
 } // namespace reduce
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -900,5 +897,3 @@ template <index_t NSize, index_t I>
 using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/sequence_helper.hpp
+++ b/include/ck/utility/sequence_helper.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -38,5 +35,3 @@ __host__ __device__ constexpr auto to_sequence(Tuple<Number<Is>...>)
 }
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -196,5 +193,3 @@ __host__ __device__ constexpr auto make_static_buffer(LongNumber<N>)
 }
 } // namespace ck
-#pragma clang diagnostic pop
--- a/include/ck/utility/statically_indexed_array.hpp
+++ b/include/ck/utility/statically_indexed_array.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -106,5 +103,3 @@ struct StaticallyIndexedArray_v2
 } // namespace ck
 #endif
-#pragma clang diagnostic pop
--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
@@ -163,5 +160,3 @@ __host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
 } // namespace ck
 #endif
-#pragma clang diagnostic pop