Commit 4131b712 authored by Umang Yadav's avatar Umang Yadav
Browse files

additional changes to make it work

parent 213196c0
......@@ -8,8 +8,10 @@
#include "ck/utility/tuple.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#ifndef __HIPCC_RTC__
#include <limits>
#include <stdlib.h>
#endif
namespace ck {
......@@ -777,7 +779,7 @@ struct BlockToCTileMap_GemmStreamK
uint32_t dp_for_sk_iters = k_iters_per_tile.get();
uint32_t best_sk_score =
std::numeric_limits<int>::max(); // we need to find the smallest sk iters
ck::NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
tentative_sk_blocks++)
{
......
......@@ -3,8 +3,6 @@
#pragma once
#include <iostream>
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
#ifndef __HIPCC_RTC__
......
......@@ -39,7 +39,7 @@ struct get_carrier<3>
{
using value_type = uint32_t;
std::array<std::byte, 3> bytes;
std::byte bytes[3];
static_assert(sizeof(bytes) <= sizeof(value_type));
// replacement of host std::copy_n()
......@@ -61,15 +61,12 @@ struct get_carrier<3>
}
// method to trigger template substitution failure
__device__ carrier(const carrier& other) noexcept
{
copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
}
__device__ carrier(const carrier& other) noexcept { copy_n(&other.bytes[0], 3, &bytes[0]); }
public:
__device__ carrier& operator=(value_type value) noexcept
{
copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]);
return *this;
}
......@@ -78,7 +75,7 @@ struct get_carrier<3>
{
std::byte result[sizeof(value_type)];
copy_n(bytes.begin(), bytes.size(), result);
copy_n(&bytes[0], 3, result);
return *reinterpret_cast<const value_type*>(result);
}
......@@ -102,9 +99,9 @@ __device__ inline int32_t amd_wave_read_first_lane(int32_t value)
return __builtin_amdgcn_readfirstlane(value);
}
template <
typename Object,
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
template <typename Object,
typename = std::enable_if_t<std::is_class<Object>::value &&
std::is_trivially_copyable<Object>::value>>
__device__ auto amd_wave_read_first_lane(const Object& obj)
{
using Size = unsigned;
......
......@@ -1027,16 +1027,6 @@ struct NumericLimits<uint16_t>
__host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
};
template <>
struct NumericLimits<uint8_t>
{
__host__ __device__ static constexpr uint8_t Lowest() noexcept { return 0; }
__host__ __device__ static constexpr uint8_t Min() noexcept { return 0; }
__host__ __device__ static constexpr uint8_t Max() noexcept { return 255U; }
__host__ __device__ static constexpr uint8_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr uint8_t QuietNaN() { return 0; }
};
template <>
struct NumericLimits<float>
{
......
......@@ -44,7 +44,7 @@ __host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
constexpr uint32_t nan_mask = is_half ? 0x7C00 : 0x7F800000;
// convert to bitwise
typedef typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
T_bitwise;
T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
......@@ -180,7 +180,7 @@ __host__ __device__ T run_cast_from_f8(f8_t x)
constexpr int exp_low_cutoff =
(1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);
typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
if constexpr(negative_zero_nan)
{
......
......@@ -168,9 +168,11 @@ __device__ double exp<double>(double x)
return exp(x);
}
#ifndef __HIPCC_RTC__
static inline __host__ float exp(float x) { return ::expf(x); }
static inline __host__ double exp(double x) { return std::exp(x); }
#endif
// greatest common divisor, aka highest common factor
__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
......
......@@ -13,6 +13,7 @@
namespace ck {
namespace math {
#ifndef __HIPCC_RTC__
// math functions for the host, some are implemented by calling C++ std functions
static inline __host__ float abs(float x) { return std::abs(x); };
......@@ -100,7 +101,7 @@ static inline __host__ half_t tanh(half_t x)
static inline __host__ float tanh(float x) { return std::tanh(x); };
static inline __host__ double tanh(double x) { return std::tanh(x); };
#endif
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
static inline __device__ float abs(float x) { return ::abs(x); };
......
......@@ -2,6 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck/utility/ignore.hpp>
namespace ck {
......@@ -43,9 +44,9 @@ template <typename T,
std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
{
std::ignore = id;
std::ignore = val;
std::ignore = seed;
ck::ignore = id;
ck::ignore = val;
ck::ignore = seed;
return 0;
}
......
......@@ -190,7 +190,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
constexpr int seed = 42;
// as thread id is not available on host, use 0 for prn generation
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
x, rng);
}
......@@ -204,7 +204,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
constexpr int seed = 42;
// as thread id is not available on host, use 0 for prn generation
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
x, rng);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment