"git@developer.sourcefind.cn:cnjsdfcy/simbricks.git" did not exist on "7fac851bb01ec70c8f3cae338ac49907c94d21ee"
Commit 4131b712 authored by Umang Yadav

additional changes to make it work

parent 213196c0
@@ -8,8 +8,10 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef __HIPCC_RTC__
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -88,8 +90,8 @@ struct BlockToCTileMap_M00_N0_M01
         const auto m00_n0_m01_to_m0_n0_block_cluster_adaptor = make_single_stage_tensor_adaptor(
             ck::make_tuple(make_insert_transform(1),
                            make_unmerge_transform(ck::make_tuple(M00, M01)),
                            make_pass_through_transform(ck::make_tuple(N0))),
             ck::make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
             ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));

@@ -233,7 +235,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
         */
         return ck::make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                               idx_N0_M01_local / M01_adapt);
     }

     template <typename CTileIdx, typename CTileDim>

@@ -309,8 +311,8 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
         index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;

         return ck::make_tuple(idx_ksplit,
                               idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                               idx_N0_M01_local / M01_adapt);
     }

     template <typename CTileIdx, typename CTileDim>

@@ -408,8 +410,8 @@ struct BlockToCTileMap_M00_N00_M01_N01
         const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
             make_single_stage_tensor_adaptor(
                 ck::make_tuple(make_insert_transform(1), // swallow the carry from lower dimensions
                                make_unmerge_transform(ck::make_tuple(M00, M01)),
                                make_unmerge_transform(ck::make_tuple(N00, N01))),
                 ck::make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
                 ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}));

@@ -527,8 +529,8 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
         const auto ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
             make_single_stage_tensor_adaptor(
                 ck::make_tuple(make_pass_through_transform(KSplit),
                                make_unmerge_transform(ck::make_tuple(M00, M01)),
                                make_unmerge_transform(ck::make_tuple(N00, N01))),
                 ck::make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                 ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}));

@@ -777,7 +779,7 @@ struct BlockToCTileMap_GemmStreamK
         uint32_t dp_for_sk_iters = k_iters_per_tile.get();

         uint32_t best_sk_score =
-            std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+            ck::NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
         for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
             tentative_sk_blocks++)
         {

@@ -820,7 +822,7 @@ struct BlockToCTileMap_GemmStreamK
             dp_num_blocks      = num_tiles; // all tile to be dp block
             dp_start_block_idx = 0;
             sk_total_iters     = 0; // clear this tiles
         }
         else
         {
...
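The stream-K hunk above swaps std::numeric_limits<int>::max() for ck::NumericLimits<int32_t>::Max(), which is what allows <limits> to move behind the new __HIPCC_RTC__ guard at the top of this header. A rough sketch of what such a specialization can look like, modeled on the NumericLimits specializations visible later in this commit; the library's actual definition may differ:

// illustrative sketch only; not a verbatim excerpt from the library
namespace ck {
template <typename T>
struct NumericLimits;

template <>
struct NumericLimits<int32_t>
{
    __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
    __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
    __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
};
} // namespace ck

Because the trait is header-only and constexpr, it works in both the host and the runtime-compiled device pass without pulling in any standard library header.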
@@ -3,8 +3,6 @@
 #pragma once

-#include <iostream>
-
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
 #ifndef __HIPCC_RTC__
...
@@ -39,7 +39,7 @@ struct get_carrier<3>
     {
         using value_type = uint32_t;

-        std::array<std::byte, 3> bytes;
+        std::byte bytes[3];
         static_assert(sizeof(bytes) <= sizeof(value_type));

         // replacement of host std::copy_n()

@@ -61,15 +61,12 @@ struct get_carrier<3>
         }

         // method to trigger template substitution failure
-        __device__ carrier(const carrier& other) noexcept
-        {
-            copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
-        }
+        __device__ carrier(const carrier& other) noexcept { copy_n(&other.bytes[0], 3, &bytes[0]); }

         public:
         __device__ carrier& operator=(value_type value) noexcept
         {
-            copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
+            copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]);
             return *this;
         }

@@ -78,7 +75,7 @@ struct get_carrier<3>
         {
             std::byte result[sizeof(value_type)];

-            copy_n(bytes.begin(), bytes.size(), result);
+            copy_n(&bytes[0], 3, result);

             return *reinterpret_cast<const value_type*>(result);
         }

@@ -102,9 +99,9 @@ __device__ inline int32_t amd_wave_read_first_lane(int32_t value)
     return __builtin_amdgcn_readfirstlane(value);
 }

-template <
-    typename Object,
-    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+template <typename Object,
+          typename = std::enable_if_t<std::is_class<Object>::value &&
+                                      std::is_trivially_copyable<Object>::value>>
 __device__ auto amd_wave_read_first_lane(const Object& obj)
 {
     using Size = unsigned;
...
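Two RTC-related adjustments in this file: the carrier's storage changes from std::array<std::byte, 3> to a plain std::byte[3] (with pointer-and-count arguments to copy_n instead of begin()/size()), and the C++17 *_v trait aliases are spelled out as ::value, presumably because the reduced type-trait support available under hipRTC does not reliably provide the variable-template shortcuts. The hunk's comment refers to a local "replacement of host std::copy_n()"; a hypothetical sketch of such a helper, assuming it only needs raw pointers and a count (the file's real helper may differ in signature and scope):

// hypothetical pointer-based copy helper; not a verbatim excerpt from the file
template <typename T>
__device__ inline void copy_n(const T* src, unsigned count, T* dst)
{
    for(unsigned i = 0; i < count; ++i)
    {
        dst[i] = src[i]; // element-wise copy, no <algorithm> required
    }
}

With raw std::byte storage, the calls in the hunk reduce to copy_n(&other.bytes[0], 3, &bytes[0]) and copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]).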
@@ -1027,16 +1027,6 @@ struct NumericLimits<uint16_t>
     __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
 };

-template <>
-struct NumericLimits<uint8_t>
-{
-    __host__ __device__ static constexpr uint8_t Lowest() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Min() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Max() noexcept { return 255U; }
-    __host__ __device__ static constexpr uint8_t Infinity() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t QuietNaN() { return 0; }
-};
-
 template <>
 struct NumericLimits<float>
 {
...
@@ -44,7 +44,7 @@ __host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
     constexpr uint32_t nan_mask = is_half ? 0x7C00 : 0x7F800000;

     // convert to bitwise
-    typedef typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
+    typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
         T_bitwise;
     T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));

@@ -180,7 +180,7 @@ __host__ __device__ T run_cast_from_f8(f8_t x)
     constexpr int exp_low_cutoff =
         (1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);

-    typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
+    typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;

     if constexpr(negative_zero_nan)
     {
...
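Both f8 cast helpers now pick their bit-wise integer type through ck::conditional instead of std::conditional, so this selection no longer depends on std::conditional on the RTC path. A minimal sketch of such a metafunction, assuming it mirrors std::conditional's interface (the actual header may differ):

namespace ck {
// primary template: a true condition selects T
template <bool Cond, typename T, typename F>
struct conditional
{
    using type = T;
};

// partial specialization: a false condition selects F
template <typename T, typename F>
struct conditional<false, T, F>
{
    using type = F;
};
} // namespace ck

// usage as in the hunk above: uint16_t for half_t inputs, uint32_t otherwise
// typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type T_bitwise;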
@@ -168,9 +168,11 @@ __device__ double exp<double>(double x)
     return exp(x);
 }

+#ifndef __HIPCC_RTC__
 static inline __host__ float exp(float x) { return ::expf(x); }

 static inline __host__ double exp(double x) { return std::exp(x); }
+#endif

 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
...
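This is the recurring idiom of the commit: includes and overloads that only make sense on the host are fenced off behind __HIPCC_RTC__, the macro defined during hipRTC compilation, so the runtime-compiled device pass never sees host standard-library calls. A condensed, illustrative version of the pattern (the host overloads are from the hunk above, the rest is a sketch):

#ifndef __HIPCC_RTC__   // only the offline/host compile path goes through here
#include <cmath>        // host-only standard header stays behind the guard
static inline __host__ float exp(float x) { return ::expf(x); }
static inline __host__ double exp(double x) { return std::exp(x); }
#endif
// __device__ overloads stay outside the guard and remain visible to hipRTC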
@@ -13,6 +13,7 @@
 namespace ck {
 namespace math {

+#ifndef __HIPCC_RTC__
 // math functions for the host, some are implemented by calling C++ std functions
 static inline __host__ float abs(float x) { return std::abs(x); };

@@ -100,7 +101,7 @@ static inline __host__ half_t tanh(half_t x)
 static inline __host__ float tanh(float x) { return std::tanh(x); };

 static inline __host__ double tanh(double x) { return std::tanh(x); };
+#endif

 // math functions for the HIP kernel, some are implemented by calling hip builtin functions
 static inline __device__ float abs(float x) { return ::abs(x); };
...
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
+#include <ck/utility/ignore.hpp>

 namespace ck {

@@ -43,9 +44,9 @@ template <typename T,
           std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
-    std::ignore = id;
-    std::ignore = val;
-    std::ignore = seed;
+    ck::ignore = id;
+    ck::ignore = val;
+    ck::ignore = seed;
     return 0;
 }
...
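The pass-through overload of prand_generator previously discarded its arguments through std::ignore, which lives in <tuple>; the commit switches to ck::ignore and includes ck/utility/ignore.hpp instead. A minimal sketch of a std-free ignore object, assuming all it needs to do is accept and discard assignments in host and device code (the header's real definition may differ):

namespace ck {
namespace detail {
struct ignore_t
{
    // accept any value and discard it
    template <typename T>
    __host__ __device__ constexpr const ignore_t& operator=(T&&) const noexcept
    {
        return *this;
    }
};
} // namespace detail

inline constexpr detail::ignore_t ignore{};
} // namespace ck

// usage as in the hunk above:
// ck::ignore = id;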
@@ -190,7 +190,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
     constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
     constexpr int seed = 42;
     // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
     return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
         x, rng);
 }

@@ -204,7 +204,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
     constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
     constexpr int seed = 42;
     // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
     return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
         x, rng);
 }
...