"git@developer.sourcefind.cn:cnjsdfcy/simbricks.git" did not exist on "7fac851bb01ec70c8f3cae338ac49907c94d21ee"
Commit 4131b712 authored by Umang Yadav

additional changes to make it work

parent 213196c0
@@ -8,8 +8,10 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef __HIPCC_RTC__
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -88,8 +90,8 @@ struct BlockToCTileMap_M00_N0_M01
         const auto m00_n0_m01_to_m0_n0_block_cluster_adaptor = make_single_stage_tensor_adaptor(
             ck::make_tuple(make_insert_transform(1),
                            make_unmerge_transform(ck::make_tuple(M00, M01)),
                            make_pass_through_transform(ck::make_tuple(N0))),
             ck::make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
             ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));

@@ -233,7 +235,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
         */
         return ck::make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                               idx_N0_M01_local / M01_adapt);
     }

     template <typename CTileIdx, typename CTileDim>

@@ -309,8 +311,8 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
         index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;

         return ck::make_tuple(idx_ksplit,
                               idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                               idx_N0_M01_local / M01_adapt);
     }

     template <typename CTileIdx, typename CTileDim>

@@ -408,8 +410,8 @@ struct BlockToCTileMap_M00_N00_M01_N01
         const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
             make_single_stage_tensor_adaptor(
                 ck::make_tuple(make_insert_transform(1), // swallow the carry from lower dimensions
                                make_unmerge_transform(ck::make_tuple(M00, M01)),
                                make_unmerge_transform(ck::make_tuple(N00, N01))),
                 ck::make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
                 ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}));

@@ -527,8 +529,8 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
         const auto ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
             make_single_stage_tensor_adaptor(
                 ck::make_tuple(make_pass_through_transform(KSplit),
                                make_unmerge_transform(ck::make_tuple(M00, M01)),
                                make_unmerge_transform(ck::make_tuple(N00, N01))),
                 ck::make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                 ck::make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}));

@@ -777,7 +779,7 @@ struct BlockToCTileMap_GemmStreamK
         uint32_t dp_for_sk_iters = k_iters_per_tile.get();

         uint32_t best_sk_score =
-            std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+            ck::NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
         for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
             tentative_sk_blocks++)
         {

@@ -820,7 +822,7 @@ struct BlockToCTileMap_GemmStreamK
             dp_num_blocks      = num_tiles; // all tile to be dp block
             dp_start_block_idx = 0;
             sk_total_iters     = 0; // clear this tiles
         }
         else
         {
...
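The stream-K hunk above swaps std::numeric_limits<int>::max() for ck::NumericLimits<int32_t>::Max(), which is what allows <limits> to move behind the new __HIPCC_RTC__ guard at the top of this header. A rough sketch of what such a specialization can look like, modeled on the NumericLimits specializations visible later in this commit; the library's actual definition may differ:

// illustrative sketch only; not a verbatim excerpt from the library
namespace ck {
template <typename T>
struct NumericLimits;

template <>
struct NumericLimits<int32_t>
{
    __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
    __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
    __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
};
} // namespace ck

Because the trait is header-only and constexpr, it works in both the host and the runtime-compiled device pass without pulling in any standard library header.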
@@ -3,8 +3,6 @@
 #pragma once

-#include <iostream>
-
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
 #ifndef __HIPCC_RTC__
...
@@ -39,7 +39,7 @@ struct get_carrier<3>
     {
         using value_type = uint32_t;

-        std::array<std::byte, 3> bytes;
+        std::byte bytes[3];
         static_assert(sizeof(bytes) <= sizeof(value_type));

         // replacement of host std::copy_n()

@@ -61,15 +61,12 @@ struct get_carrier<3>
         }

         // method to trigger template substitution failure
-        __device__ carrier(const carrier& other) noexcept
-        {
-            copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
-        }
+        __device__ carrier(const carrier& other) noexcept { copy_n(&other.bytes[0], 3, &bytes[0]); }

         public:
         __device__ carrier& operator=(value_type value) noexcept
         {
-            copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
+            copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]);
             return *this;
         }

@@ -78,7 +75,7 @@ struct get_carrier<3>
         {
             std::byte result[sizeof(value_type)];

-            copy_n(bytes.begin(), bytes.size(), result);
+            copy_n(&bytes[0], 3, result);

             return *reinterpret_cast<const value_type*>(result);
         }

@@ -102,9 +99,9 @@ __device__ inline int32_t amd_wave_read_first_lane(int32_t value)
     return __builtin_amdgcn_readfirstlane(value);
 }

-template <
-    typename Object,
-    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+template <typename Object,
+          typename = std::enable_if_t<std::is_class<Object>::value &&
+                                      std::is_trivially_copyable<Object>::value>>
 __device__ auto amd_wave_read_first_lane(const Object& obj)
 {
     using Size = unsigned;
...
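Two RTC-related adjustments in this file: the carrier's storage changes from std::array<std::byte, 3> to a plain std::byte[3] (with pointer-and-count arguments to copy_n instead of begin()/size()), and the C++17 *_v trait aliases are spelled out as ::value, presumably because the reduced type-trait support available under hipRTC does not reliably provide the variable-template shortcuts. The hunk's comment refers to a local "replacement of host std::copy_n()"; a hypothetical sketch of such a helper, assuming it only needs raw pointers and a count (the file's real helper may differ in signature and scope):

// hypothetical pointer-based copy helper; not a verbatim excerpt from the file
template <typename T>
__device__ inline void copy_n(const T* src, unsigned count, T* dst)
{
    for(unsigned i = 0; i < count; ++i)
    {
        dst[i] = src[i]; // element-wise copy, no <algorithm> required
    }
}

With raw std::byte storage, the calls in the hunk reduce to copy_n(&other.bytes[0], 3, &bytes[0]) and copy_n(reinterpret_cast<const std::byte*>(&value), 3, &bytes[0]).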
@@ -1027,16 +1027,6 @@ struct NumericLimits<uint16_t>
     __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
 };

-template <>
-struct NumericLimits<uint8_t>
-{
-    __host__ __device__ static constexpr uint8_t Lowest() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Min() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t Max() noexcept { return 255U; }
-    __host__ __device__ static constexpr uint8_t Infinity() noexcept { return 0; }
-    __host__ __device__ static constexpr uint8_t QuietNaN() { return 0; }
-};
-
 template <>
 struct NumericLimits<float>
 {
...
@@ -44,7 +44,7 @@ __host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
     constexpr uint32_t nan_mask = is_half ? 0x7C00 : 0x7F800000;

     // convert to bitwise
-    typedef typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
+    typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
         T_bitwise;
     T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));

@@ -180,7 +180,7 @@ __host__ __device__ T run_cast_from_f8(f8_t x)
     constexpr int exp_low_cutoff =
         (1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);

-    typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
+    typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;

     if constexpr(negative_zero_nan)
     {
...
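Both f8 cast helpers now pick their bit-wise integer type through ck::conditional instead of std::conditional, so this selection no longer depends on std::conditional on the RTC path. A minimal sketch of such a metafunction, assuming it mirrors std::conditional's interface (the actual header may differ):

namespace ck {
// primary template: a true condition selects T
template <bool Cond, typename T, typename F>
struct conditional
{
    using type = T;
};

// partial specialization: a false condition selects F
template <typename T, typename F>
struct conditional<false, T, F>
{
    using type = F;
};
} // namespace ck

// usage as in the hunk above: uint16_t for half_t inputs, uint32_t otherwise
// typedef typename ck::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type T_bitwise;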
@@ -168,9 +168,11 @@ __device__ double exp<double>(double x)
     return exp(x);
 }

+#ifndef __HIPCC_RTC__
 static inline __host__ float exp(float x) { return ::expf(x); }

 static inline __host__ double exp(double x) { return std::exp(x); }
+#endif

 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
...
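This is the recurring idiom of the commit: includes and overloads that only make sense on the host are fenced off behind __HIPCC_RTC__, the macro defined during hipRTC compilation, so the runtime-compiled device pass never sees host standard-library calls. A condensed, illustrative version of the pattern (the host overloads are from the hunk above, the rest is a sketch):

#ifndef __HIPCC_RTC__   // only the offline/host compile path goes through here
#include <cmath>        // host-only standard header stays behind the guard
static inline __host__ float exp(float x) { return ::expf(x); }
static inline __host__ double exp(double x) { return std::exp(x); }
#endif
// __device__ overloads stay outside the guard and remain visible to hipRTC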
@@ -13,6 +13,7 @@
 namespace ck {
 namespace math {

+#ifndef __HIPCC_RTC__
 // math functions for the host, some are implemented by calling C++ std functions
 static inline __host__ float abs(float x) { return std::abs(x); };

@@ -100,7 +101,7 @@ static inline __host__ half_t tanh(half_t x)
 static inline __host__ float tanh(float x) { return std::tanh(x); };

 static inline __host__ double tanh(double x) { return std::tanh(x); };
+#endif

 // math functions for the HIP kernel, some are implemented by calling hip builtin functions
 static inline __device__ float abs(float x) { return ::abs(x); };
...
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
+#include <ck/utility/ignore.hpp>

 namespace ck {

@@ -43,9 +44,9 @@ template <typename T,
           std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
-    std::ignore = id;
-    std::ignore = val;
-    std::ignore = seed;
+    ck::ignore = id;
+    ck::ignore = val;
+    ck::ignore = seed;
     return 0;
 }
...
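The pass-through overload of prand_generator previously discarded its arguments through std::ignore, which lives in <tuple>; the commit switches to ck::ignore and includes ck/utility/ignore.hpp instead. A minimal sketch of a std-free ignore object, assuming all it needs to do is accept and discard assignments in host and device code (the header's real definition may differ):

namespace ck {
namespace detail {
struct ignore_t
{
    // accept any value and discard it
    template <typename T>
    __host__ __device__ constexpr const ignore_t& operator=(T&&) const noexcept
    {
        return *this;
    }
};
} // namespace detail

inline constexpr detail::ignore_t ignore{};
} // namespace ck

// usage as in the hunk above:
// ck::ignore = id;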
@@ -190,7 +190,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
     constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
     constexpr int seed = 42;
     // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
     return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
         x, rng);
 }

@@ -204,7 +204,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
     constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
     constexpr int seed = 42;
     // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
     return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
         x, rng);
 }
...