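Additional changes to make it work when compiling with __HIPCC_RTC__ defined: guard host-only includes and host math functions, and replace std:: utilities (numeric_limits, conditional, ignore, array, uintptr_t) with ck:: or built-in equivalents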

4131b712 · Umang Yadav · 213196c0 · 4131b712 · 4131b712 · 4131b712
Commit 4131b712 authored Sep 27, 2023 by Umang Yadav
9 changed files
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -8,8 +8,10 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef __HIPCC_RTC__
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -777,7 +779,7 @@ struct BlockToCTileMap_GemmStreamK
            uint32_t dp_for_sk_iters = k_iters_per_tile.get();

            uint32_t best_sk_score =
-                std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+                ck::NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
                tentative_sk_blocks++)
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -3,8 +3,6 @@

 #pragma once

-#include <iostream>
-
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
 #ifndef __HIPCC_RTC__

--- a/include/ck/utility/amd_wave_read_first_lane.hpp
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -39,7 +39,7 @@ struct get_carrier<3>
    {
        using value_type = uint32_t;

-        std::array<std::byte, 3> bytes;
+        std::byte bytes[3];
        static_assert(sizeof(bytes) <= sizeof(value_type));

        // replacement of host std::copy_n()
@@ -61,15 +61,12 @@ struct get_carrier<3>
        }

        // method to trigger template substitution failure
-        __device__ carrier(const carrier& other) noexcept
-        {
-            copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
-        }
+        __device__ carrier(const carrier& other) noexcept { copy_n(&other.bytes[0], 3, &bytes[0]); }

        public:
        __device__ carrier& operator=(value_type value) noexcept
        {
-            copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
+            copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]);

            return *this;
        }
@@ -78,7 +75,7 @@ struct get_carrier<3>
        {
            std::byte result[sizeof(value_type)];

-            copy_n(bytes.begin(), bytes.size(), result);
+            copy_n(&bytes[0], 3, result);

            return *reinterpret_cast<const value_type*>(result);
        }
@@ -102,9 +99,9 @@ __device__ inline int32_t amd_wave_read_first_lane(int32_t value)
    return __builtin_amdgcn_readfirstlane(value);
 }

-template <
-    typename Object,
-    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+template <typename Object,
+          typename = std::enable_if_t<std::is_class<Object>::value &&
+                                      std::is_trivially_copyable<Object>::value>>
 __device__ auto amd_wave_read_first_lane(const Object& obj)
 {
    using Size                = unsigned;

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -1027,16 +1027,6 @@ struct NumericLimits<uint16_t>
    __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
 };

-template <>
-struct NumericLimits<uint8_t>
-{
-    __host__ __device__ static constexpr uint8_t Lowest() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Min() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Max() noexcept { return 255U; }
-    __host__ __device__ static constexpr uint8_t Infinity() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t QuietNaN() { return 0; }
-};
-
 template <>
 struct NumericLimits<float>
 {

--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -44,7 +44,7 @@ __host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
    constexpr uint32_t nan_mask = is_half ? 0x7C00 : 0x7F800000;

    // convert to bitwise
-    typedef typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
+    typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
        T_bitwise;
    T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));

@@ -180,7 +180,7 @@ __host__ __device__ T run_cast_from_f8(f8_t x)

    constexpr int exp_low_cutoff =
        (1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);
-    typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
+    typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;

    if constexpr(negative_zero_nan)
    {

--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -168,9 +168,11 @@ __device__ double exp<double>(double x)
    return exp(x);
 }

+#ifndef __HIPCC_RTC__
 static inline __host__ float exp(float x) { return ::expf(x); }

 static inline __host__ double exp(double x) { return std::exp(x); }
+#endif

 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)

--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -13,6 +13,7 @@
 namespace ck {
 namespace math {

+#ifndef __HIPCC_RTC__
 // math functions for the host,  some are implemented by calling C++ std functions

 static inline __host__ float abs(float x) { return std::abs(x); };
@@ -100,7 +101,7 @@ static inline __host__ half_t tanh(half_t x)
 static inline __host__ float tanh(float x) { return std::tanh(x); };

 static inline __host__ double tanh(double x) { return std::tanh(x); };
-
+#endif
 // math functions for the HIP kernel,  some are implemented by calling hip builtin functions

 static inline __device__ float abs(float x) { return ::abs(x); };

--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
+#include <ck/utility/ignore.hpp>

 namespace ck {

@@ -43,9 +44,9 @@ template <typename T,
          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
-    std::ignore = id;
-    std::ignore = val;
-    std::ignore = seed;
+    ck::ignore = id;
+    ck::ignore = val;
+    ck::ignore = seed;

    return 0;
 }

--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -190,7 +190,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 42;
    // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
    return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
 }
@@ -204,7 +204,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 42;
    // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
    return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
 }