Unverified Commit 2e3183af authored by arai713's avatar arai713 Committed by GitHub
Browse files

Codegen hipRTC compilation (#1579)



* updating codegen build for MIOpen access: adding .cmake for codegen component

* updating CMake

* adding in header guards for some headers due to issues with hiprtc compilation in MIOpen

* some more header guards

* putting env file in header guard

* cleaning up some includes

* updated types file for hiprtc purposes

* fixed types file: bit-wise/memcpy issue

* updating multiple utility files to deal with standard header inclusion for hiprtc

* added some more header guards in the utility files, replacing some standard header functionality

* added some more header guards

* fixing some conflicts in utility files, another round of header guards

* fixing errors in data type file

* resolved conflict errors in a few utility files

* added header guards/replicated functionality in device files

* resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd

* resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp

* added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp

* fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl

* replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector

* resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit

* added replicated functionality for standard header methods in utility files

* replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp

* temp fix for namespace error in MIOpen

* remove standard header usage in codegen device op

* removed standard header usage in elementwise files, resolved namespace errors

* formatting fix

* changed codegen argument to ON for testing

* temporarily removing codegen compiler flag for testing purposes

* added codegen flag again, set default to ON

* set codegen flag default back to OFF

* replaced enable_if_t standard header usage in data_type.hpp

* added some debug prints to pinpoint issues in MIOpen

* added print outs to debug in MIOpen

* removed debug print outs from device op

* resolved stdexcept include error

* formatting fix

* adding includes to new fp8 file to resolve ck::enable_if_t errors

* made changes to amd_wave_read_first_lane

* updated functionality in type utility file

* fixed end of file issue

* resolved errors in type utility file, added functionality to array utility file

* fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x

* formatting fix

* replaced standard header usage in amd_ck_fp8 file

* added include to random_gen file

* removed and replicated standard header usage from data_type and type_convert files for fp8 changes

* replicated standard unsigned integer types in random_gen

* resolved comments from review: put calls to reinterpret_cast for size_t in header guards

* updated/added copyright headers

* removed duplicate header

* fixed typo in header guard

* updated copyright headers

---------
Co-authored-by: default avatarIllia Silin <98187287+illsilin@users.noreply.github.com>
parent 2ab8bf4c
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -212,7 +212,7 @@ template <typename SrcData, ...@@ -212,7 +212,7 @@ template <typename SrcData,
typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false> typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v2 struct ThreadwiseTensorSliceTransfer_v2
{ {
static_assert((InvalidElementAsNaN && !std::is_integral<DstData>::value) || static_assert((InvalidElementAsNaN && !ck::is_integral<DstData>::value) ||
(!InvalidElementAsNaN), (!InvalidElementAsNaN),
"Filling invalid element as NaN is only for floating point types"); "Filling invalid element as NaN is only for floating point types");
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "ck/library/utility/numeric.hpp"
#include "ck/utility/common_header.hpp" #include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
...@@ -148,8 +147,8 @@ struct TransformConvFwdToGemm ...@@ -148,8 +147,8 @@ struct TransformConvFwdToGemm
template <typename ConvDimsType, template <typename ConvDimsType,
typename ConvSpatialDimsType, typename ConvSpatialDimsType,
index_t NDim = NDimSpatial, index_t NDim = NDimSpatial,
typename std::enable_if<NDim == 1, bool>::type = false> typename ck::enable_if<NDim == 1, bool>::type = false>
__host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& a_g_n_c_wis_strides,
const ConvDimsType& b_g_k_c_xs_lengths, const ConvDimsType& b_g_k_c_xs_lengths,
...@@ -201,11 +200,15 @@ struct TransformConvFwdToGemm ...@@ -201,11 +200,15 @@ struct TransformConvFwdToGemm
InRightPadW_{input_right_pads[I0]}, InRightPadW_{input_right_pads[I0]},
ZYX_{X_} ZYX_{X_}
{ {
#ifdef CK_CODE_GEN_RTC
static_assert(is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#else
static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> || static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>); is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> || static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>); is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#endif
if constexpr(SplitN) if constexpr(SplitN)
{ {
N_ = GetSplitedNSize( N_ = GetSplitedNSize(
...@@ -219,8 +222,8 @@ struct TransformConvFwdToGemm ...@@ -219,8 +222,8 @@ struct TransformConvFwdToGemm
template <typename ConvDimsType, template <typename ConvDimsType,
typename ConvSpatialDimsType, typename ConvSpatialDimsType,
index_t NDim = NDimSpatial, index_t NDim = NDimSpatial,
typename std::enable_if<NDim == 2, bool>::type = false> typename ck::enable_if<NDim == 2, bool>::type = false>
__host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& a_g_n_c_wis_strides,
const ConvDimsType& b_g_k_c_xs_lengths, const ConvDimsType& b_g_k_c_xs_lengths,
...@@ -272,11 +275,15 @@ struct TransformConvFwdToGemm ...@@ -272,11 +275,15 @@ struct TransformConvFwdToGemm
InRightPadW_{input_right_pads[I1]}, InRightPadW_{input_right_pads[I1]},
ZYX_{Y_ * X_} ZYX_{Y_ * X_}
{ {
#ifdef CK_CODE_GEN_RTC
static_assert(is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#else
static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> || static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>); is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> || static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>); is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#endif
if constexpr(SplitN) if constexpr(SplitN)
{ {
N_ = GetSplitedNSize( N_ = GetSplitedNSize(
...@@ -290,8 +297,8 @@ struct TransformConvFwdToGemm ...@@ -290,8 +297,8 @@ struct TransformConvFwdToGemm
template <typename ConvDimsType, template <typename ConvDimsType,
typename ConvSpatialDimsType, typename ConvSpatialDimsType,
index_t NDim = NDimSpatial, index_t NDim = NDimSpatial,
typename std::enable_if<NDim == 3, bool>::type = false> typename ck::enable_if<NDim == 3, bool>::type = false>
__host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& a_g_n_c_wis_strides,
const ConvDimsType& b_g_k_c_xs_lengths, const ConvDimsType& b_g_k_c_xs_lengths,
...@@ -343,11 +350,15 @@ struct TransformConvFwdToGemm ...@@ -343,11 +350,15 @@ struct TransformConvFwdToGemm
InRightPadW_{input_right_pads[I2]}, InRightPadW_{input_right_pads[I2]},
ZYX_{Z_ * Y_ * X_} ZYX_{Z_ * Y_ * X_}
{ {
#ifdef CK_CODE_GEN_RTC
static_assert(is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#else
static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> || static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>); is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> || static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>); is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
#endif
if constexpr(SplitN) if constexpr(SplitN)
{ {
N_ = GetSplitedNSize( N_ = GetSplitedNSize(
...@@ -478,11 +489,11 @@ struct TransformConvFwdToGemm ...@@ -478,11 +489,11 @@ struct TransformConvFwdToGemm
// TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as
// properties // properties
template <typename ALayout, template <typename ALayout,
typename std::enable_if<NDimSpatial == 1 && typename ck::enable_if<NDimSpatial == 1 &&
(is_same_v<ALayout, tensor_layout::convolution::G_NW_C> || (is_same_v<ALayout, tensor_layout::convolution::G_NW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NWGC> || is_same_v<ALayout, tensor_layout::convolution::NWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNWC>), is_same_v<ALayout, tensor_layout::convolution::GNWC>),
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeADescriptor_M_K() const __host__ __device__ auto MakeADescriptor_M_K() const
{ {
if constexpr(ConvForwardSpecialization == if constexpr(ConvForwardSpecialization ==
...@@ -691,11 +702,11 @@ struct TransformConvFwdToGemm ...@@ -691,11 +702,11 @@ struct TransformConvFwdToGemm
} }
template <typename ALayout, template <typename ALayout,
typename std::enable_if< typename ck::enable_if<NDimSpatial == 2 &&
NDimSpatial == 2 && (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> || (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NHWGC> || is_same_v<ALayout, tensor_layout::convolution::NHWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNHWC>), is_same_v<ALayout, tensor_layout::convolution::GNHWC>),
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeADescriptor_M_K() const __host__ __device__ auto MakeADescriptor_M_K() const
{ {
...@@ -932,7 +943,7 @@ struct TransformConvFwdToGemm ...@@ -932,7 +943,7 @@ struct TransformConvFwdToGemm
} }
template <typename ALayout, template <typename ALayout,
typename std::enable_if< typename ck::enable_if<
NDimSpatial == 3 && (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> || NDimSpatial == 3 && (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NDHWGC> || is_same_v<ALayout, tensor_layout::convolution::NDHWGC> ||
is_same_v<ALayout, tensor_layout::convolution::GNDHWC>), is_same_v<ALayout, tensor_layout::convolution::GNDHWC>),
...@@ -1242,19 +1253,19 @@ struct TransformConvFwdToGemm ...@@ -1242,19 +1253,19 @@ struct TransformConvFwdToGemm
} }
template <typename BLayout, template <typename BLayout,
typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKXC> || typename ck::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKYXC> || is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKZYXC>, is_same_v<BLayout, tensor_layout::convolution::GKZYXC>,
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeBDescriptor_N_K() const __host__ __device__ auto MakeBDescriptor_N_K() const
{ {
if constexpr(ConvForwardSpecialization == if constexpr(ConvForwardSpecialization ==
device::ConvolutionForwardSpecialization::Filter3x3) device::ConvolutionForwardSpecialization::Filter3x3)
{ {
using FilterSizeNumType = using FilterSizeNumType =
std::conditional_t<NDimSpatial == 1, ck::conditional_t<NDimSpatial == 1,
Number<3>, Number<3>,
std::conditional_t<NDimSpatial == 2, Number<9>, Number<27>>>; ck::conditional_t<NDimSpatial == 2, Number<9>, Number<27>>>;
if constexpr(NumGroupsToMerge == 1) if constexpr(NumGroupsToMerge == 1)
{ {
...@@ -1297,13 +1308,13 @@ struct TransformConvFwdToGemm ...@@ -1297,13 +1308,13 @@ struct TransformConvFwdToGemm
template < template <
typename BLayout, typename BLayout,
typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::G_K_X_C> || typename ck::enable_if<is_same_v<BLayout, tensor_layout::convolution::G_K_X_C> ||
is_same_v<BLayout, tensor_layout::convolution::G_K_YX_C> || is_same_v<BLayout, tensor_layout::convolution::G_K_YX_C> ||
is_same_v<BLayout, tensor_layout::convolution::G_K_ZYX_C> || is_same_v<BLayout, tensor_layout::convolution::G_K_ZYX_C> ||
is_same_v<BLayout, tensor_layout::convolution::KXGC> || is_same_v<BLayout, tensor_layout::convolution::KXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KYXGC> || is_same_v<BLayout, tensor_layout::convolution::KYXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KZYXGC>, is_same_v<BLayout, tensor_layout::convolution::KZYXGC>,
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeBDescriptor_N_K() const __host__ __device__ auto MakeBDescriptor_N_K() const
{ {
const auto wei_k_yx_c_desc = make_naive_tensor_descriptor( const auto wei_k_yx_c_desc = make_naive_tensor_descriptor(
...@@ -1318,36 +1329,36 @@ struct TransformConvFwdToGemm ...@@ -1318,36 +1329,36 @@ struct TransformConvFwdToGemm
return wei_gemmn_gemmk_desc; return wei_gemmn_gemmk_desc;
} }
template <typename CLayout, template <
index_t NDimSp = NDimSpatial, typename CLayout,
index_t NDimSp = NDimSpatial,
typename std::enable_if<NDimSp == 1 && typename ck::enable_if<NDimSp == 1 && (is_same_v<CLayout, tensor_layout::convolution::G_K>),
(is_same_v<CLayout, tensor_layout::convolution::G_K>), bool>::type = false>
bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const __host__ __device__ auto MakeCDescriptor_M_N() const
{ {
return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_), return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_),
make_tuple(I0, KStrideTensorC_)); make_tuple(I0, KStrideTensorC_));
} }
template <typename CLayout, template <
index_t NDimSp = NDimSpatial, typename CLayout,
index_t NDimSp = NDimSpatial,
typename std::enable_if<NDimSp == 2 && typename ck::enable_if<NDimSp == 2 && (is_same_v<CLayout, tensor_layout::convolution::G_K>),
(is_same_v<CLayout, tensor_layout::convolution::G_K>), bool>::type = false>
bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const __host__ __device__ auto MakeCDescriptor_M_N() const
{ {
return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_),
make_tuple(I0, KStrideTensorC_)); make_tuple(I0, KStrideTensorC_));
} }
template <typename CLayout, template <
index_t NDimSp = NDimSpatial, typename CLayout,
index_t NDimSp = NDimSpatial,
typename std::enable_if<NDimSp == 3 && typename ck::enable_if<NDimSp == 3 && (is_same_v<CLayout, tensor_layout::convolution::G_K>),
(is_same_v<CLayout, tensor_layout::convolution::G_K>), bool>::type = false>
bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const __host__ __device__ auto MakeCDescriptor_M_N() const
{ {
return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_), return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_),
...@@ -1355,12 +1366,12 @@ struct TransformConvFwdToGemm ...@@ -1355,12 +1366,12 @@ struct TransformConvFwdToGemm
} }
template <typename CLayout, template <typename CLayout,
index_t NDimSp = NDimSpatial, index_t NDimSp = NDimSpatial,
typename std::enable_if<NDimSp == 1 && typename ck::enable_if<NDimSp == 1 &&
(is_same_v<CLayout, tensor_layout::convolution::G_NW_K> || (is_same_v<CLayout, tensor_layout::convolution::G_NW_K> ||
is_same_v<CLayout, tensor_layout::convolution::NWGK> || is_same_v<CLayout, tensor_layout::convolution::NWGK> ||
is_same_v<CLayout, tensor_layout::convolution::GNWK>), is_same_v<CLayout, tensor_layout::convolution::GNWK>),
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const __host__ __device__ auto MakeCDescriptor_M_N() const
{ {
const IndexType NDoHoWo = N_ * Wo_; const IndexType NDoHoWo = N_ * Wo_;
...@@ -1410,11 +1421,11 @@ struct TransformConvFwdToGemm ...@@ -1410,11 +1421,11 @@ struct TransformConvFwdToGemm
template <typename CLayout, template <typename CLayout,
index_t NDimSp = NDimSpatial, index_t NDimSp = NDimSpatial,
typename std::enable_if< typename ck::enable_if<NDimSp == 2 &&
NDimSp == 2 && (is_same_v<CLayout, tensor_layout::convolution::G_NHW_K> || (is_same_v<CLayout, tensor_layout::convolution::G_NHW_K> ||
is_same_v<CLayout, tensor_layout::convolution::NHWGK> || is_same_v<CLayout, tensor_layout::convolution::NHWGK> ||
is_same_v<CLayout, tensor_layout::convolution::GNHWK>), is_same_v<CLayout, tensor_layout::convolution::GNHWK>),
bool>::type = false> bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const __host__ __device__ auto MakeCDescriptor_M_N() const
{ {
const IndexType NDoHoWo = N_ * Ho_ * Wo_; const IndexType NDoHoWo = N_ * Ho_ * Wo_;
...@@ -1467,7 +1478,7 @@ struct TransformConvFwdToGemm ...@@ -1467,7 +1478,7 @@ struct TransformConvFwdToGemm
template <typename CLayout, template <typename CLayout,
index_t NDimSp = NDimSpatial, index_t NDimSp = NDimSpatial,
typename std::enable_if< typename ck::enable_if<
NDimSp == 3 && (is_same_v<CLayout, tensor_layout::convolution::G_NDHW_K> || NDimSp == 3 && (is_same_v<CLayout, tensor_layout::convolution::G_NDHW_K> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGK> || is_same_v<CLayout, tensor_layout::convolution::NDHWGK> ||
is_same_v<CLayout, tensor_layout::convolution::GNDHWK>), is_same_v<CLayout, tensor_layout::convolution::GNDHWK>),
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "data_type.hpp" #include "data_type.hpp"
...@@ -1021,15 +1021,24 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, ...@@ -1021,15 +1021,24 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
static_assert(bytes_per_thread == dword_bytes); static_assert(bytes_per_thread == dword_bytes);
#ifndef CK_CODE_GEN_RTC
const uint32_t* global_ptr = const uint32_t* global_ptr =
reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr)); reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
#else
const uint32_t* global_ptr =
reinterpret_cast<uint32_t*>(reinterpret_cast<size_t>(global_base_ptr));
#endif
const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size); const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
T* lds_ptr = lds_base_ptr + lds_offset; T* lds_ptr = lds_base_ptr + lds_offset;
#ifndef CK_CODE_GEN_RTC
auto const lds_ptr_sgpr = auto const lds_ptr_sgpr =
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr))); __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
#else
auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
#endif
asm volatile("s_mov_b32 m0, %0; \n\t" asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes), "v"(global_offset_bytes),
...@@ -1038,8 +1047,13 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, ...@@ -1038,8 +1047,13 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
#else #else
// LDS pointer must be attributed with the LDS address space. // LDS pointer must be attributed with the LDS address space.
__attribute__((address_space(3))) uint32_t* lds_ptr = __attribute__((address_space(3))) uint32_t* lds_ptr =
#ifndef CK_CODE_GEN_RTC
reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset)); reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
#else
reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
#endif
llvm_amdgcn_raw_buffer_load_lds( llvm_amdgcn_raw_buffer_load_lds(
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "ck/ck.hpp"
#include "ck/utility/enable_if.hpp"
#include "ck/utility/random_gen.hpp" #include "ck/utility/random_gen.hpp"
#include "ck/utility/type.hpp" #include "ck/utility/type.hpp"
...@@ -424,9 +426,9 @@ __host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a) ...@@ -424,9 +426,9 @@ __host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a)
} }
template <typename T, template <typename T,
std::enable_if_t<std::is_same_v<T, bf8_ocp_t> || std::is_same_v<T, f8_ocp_t> || ck::enable_if_t<is_same_v<T, bf8_ocp_t> || is_same_v<T, f8_ocp_t> ||
std::is_same_v<T, bf8_fnuz_t> || std::is_same_v<T, f8_fnuz_t>, is_same_v<T, bf8_fnuz_t> || is_same_v<T, f8_fnuz_t>,
bool> = true> bool> = true>
__host__ __device__ static inline constexpr bool fp8_is_inf(T) __host__ __device__ static inline constexpr bool fp8_is_inf(T)
{ {
return false; return false;
...@@ -823,7 +825,11 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) ...@@ -823,7 +825,11 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
if constexpr(stochastic_rounding) if constexpr(stochastic_rounding)
{ {
constexpr int seed = 1254739; constexpr int seed = 1254739;
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f); #ifndef CK_CODE_GEN_RTC
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
#else
rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f);
#endif
} }
return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>( return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
f, rng); f, rng);
...@@ -839,7 +845,11 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) ...@@ -839,7 +845,11 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
if constexpr(stochastic_rounding) if constexpr(stochastic_rounding)
{ {
constexpr int seed = 1254739; constexpr int seed = 1254739;
#ifndef CK_CODE_GEN_RTC
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f); rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
#else
rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f);
#endif
} }
if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ) if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -7,10 +7,12 @@ ...@@ -7,10 +7,12 @@
#include "ck/utility/functional2.hpp" #include "ck/utility/functional2.hpp"
#include "ck/utility/math.hpp" #include "ck/utility/math.hpp"
#ifndef CK_CODE_GEN_RTC
#include <array> #include <array>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <type_traits> #include <type_traits>
#endif
namespace ck { namespace ck {
namespace detail { namespace detail {
...@@ -37,7 +39,7 @@ struct get_carrier<3> ...@@ -37,7 +39,7 @@ struct get_carrier<3>
{ {
using value_type = uint32_t; using value_type = uint32_t;
std::array<std::byte, 3> bytes; Array<ck::byte, 3> bytes;
static_assert(sizeof(bytes) <= sizeof(value_type)); static_assert(sizeof(bytes) <= sizeof(value_type));
// replacement of host std::copy_n() // replacement of host std::copy_n()
...@@ -61,22 +63,22 @@ struct get_carrier<3> ...@@ -61,22 +63,22 @@ struct get_carrier<3>
// method to trigger template substitution failure // method to trigger template substitution failure
__device__ carrier(const carrier& other) noexcept __device__ carrier(const carrier& other) noexcept
{ {
copy_n(other.bytes.begin(), bytes.size(), bytes.begin()); copy_n(other.bytes.begin(), bytes.Size(), bytes.begin());
} }
public: public:
__device__ carrier& operator=(value_type value) noexcept __device__ carrier& operator=(value_type value) noexcept
{ {
copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin()); copy_n(reinterpret_cast<const ck::byte*>(&value), bytes.Size(), bytes.begin());
return *this; return *this;
} }
__device__ operator value_type() const noexcept __device__ operator value_type() const noexcept
{ {
std::byte result[sizeof(value_type)]; ck::byte result[sizeof(value_type)];
copy_n(bytes.begin(), bytes.size(), result); copy_n(bytes.begin(), bytes.Size(), result);
return *reinterpret_cast<const value_type*>(result); return *reinterpret_cast<const value_type*>(result);
} }
...@@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) ...@@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)
{ {
constexpr unsigned object_size = sizeof(int64_t); constexpr unsigned object_size = sizeof(int64_t);
constexpr unsigned second_part_offset = object_size / 2; constexpr unsigned second_part_offset = object_size / 2;
auto* const from_obj = reinterpret_cast<const std::byte*>(&value); auto* const from_obj = reinterpret_cast<const ck::byte*>(&value);
alignas(int64_t) std::byte to_obj[object_size]; alignas(int64_t) ck::byte to_obj[object_size];
using Sgpr = uint32_t; using Sgpr = uint32_t;
...@@ -122,17 +124,16 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) ...@@ -122,17 +124,16 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)
return *reinterpret_cast<int64_t*>(to_obj); return *reinterpret_cast<int64_t*>(to_obj);
} }
template < template <typename Object,
typename Object, typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
__device__ auto amd_wave_read_first_lane(const Object& obj) __device__ auto amd_wave_read_first_lane(const Object& obj)
{ {
using Size = unsigned; using Size = unsigned;
constexpr Size SgprSize = 4; constexpr Size SgprSize = 4;
constexpr Size ObjectSize = sizeof(Object); constexpr Size ObjectSize = sizeof(Object);
auto* const from_obj = reinterpret_cast<const std::byte*>(&obj); auto* const from_obj = reinterpret_cast<const ck::byte*>(&obj);
alignas(Object) std::byte to_obj[ObjectSize]; alignas(Object) ck::byte to_obj[ObjectSize];
constexpr Size RemainedSize = ObjectSize % SgprSize; constexpr Size RemainedSize = ObjectSize % SgprSize;
constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_ARRAY_HPP #ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP #define CK_ARRAY_HPP
...@@ -38,6 +38,8 @@ struct Array ...@@ -38,6 +38,8 @@ struct Array
} }
__host__ __device__ constexpr const TData* begin() const { return &mData[0]; } __host__ __device__ constexpr const TData* begin() const { return &mData[0]; }
__host__ __device__ constexpr const TData* end() const { return &mData[NSize]; } __host__ __device__ constexpr const TData* end() const { return &mData[NSize]; }
__host__ __device__ constexpr TData* begin() { return &mData[0]; }
__host__ __device__ constexpr TData* end() { return &mData[NSize]; }
}; };
// empty Array // empty Array
...@@ -54,7 +56,7 @@ template <typename X, typename... Xs> ...@@ -54,7 +56,7 @@ template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs) __host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs)
{ {
using data_type = remove_cvref_t<X>; using data_type = remove_cvref_t<X>;
return Array<data_type, sizeof...(Xs) + 1>{std::forward<X>(x), std::forward<Xs>(xs)...}; return Array<data_type, sizeof...(Xs) + 1>{ck::forward<X>(x), ck::forward<Xs>(xs)...};
} }
// make empty array // make empty array
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CONTAINER_HELPER_HPP #ifndef CK_CONTAINER_HELPER_HPP
#define CK_CONTAINER_HELPER_HPP #define CK_CONTAINER_HELPER_HPP
...@@ -326,14 +326,14 @@ template <typename T, index_t NX, index_t NY> ...@@ -326,14 +326,14 @@ template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay) __host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{ {
return unpack2( return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay); [&](auto&&... zs) { return make_array(ck::forward<decltype(zs)>(zs)...); }, ax, ay);
} }
template <typename... X, typename... Y> template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty) __host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{ {
return unpack2( return unpack2(
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty); [&](auto&&... zs) { return make_tuple(ck::forward<decltype(zs)>(zs)...); }, tx, ty);
} }
template <typename Container> template <typename Container>
......
...@@ -5,9 +5,21 @@ ...@@ -5,9 +5,21 @@
#include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/amd_ck_fp8.hpp"
#include "ck/utility/statically_indexed_array.hpp" #include "ck/utility/statically_indexed_array.hpp"
#ifdef CK_CODE_GEN_RTC
// hipRTC compiles without the C++ standard headers, so provide the fixed-width
// integer aliases normally supplied by <cstdint> at global scope.
// NOTE(review): float_t is aliased to float here; <math.h> permits float_t to be
// double on some targets -- confirm no shared host/device code depends on that.
using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = signed short;
using uint16_t = unsigned short;
using float_t = float;
#endif
namespace ck { namespace ck {
#ifdef CK_CODE_GEN_RTC
// std::byte (from <cstddef>) is unavailable under hipRTC; fall back to
// unsigned char, which has the same size and aliasing properties.
// NOTE(review): unlike std::byte, unsigned char permits implicit arithmetic, so
// misuse that the std::byte type would reject compiles silently in RTC builds.
using byte = unsigned char;
#else
using std::byte;
#endif
using bhalf_t = ushort; using bhalf_t = ushort;
using half_t = _Float16; using half_t = _Float16;
using int4_t = _BitInt(4); using int4_t = _BitInt(4);
...@@ -217,7 +229,7 @@ struct scalar_type<bool> ...@@ -217,7 +229,7 @@ struct scalar_type<bool>
}; };
template <typename T> template <typename T>
struct vector_type<T, 1, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 1, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using type = d1_t; using type = d1_t;
...@@ -253,7 +265,7 @@ struct vector_type<T, 1, typename std::enable_if_t<is_native_type<T>()>> ...@@ -253,7 +265,7 @@ struct vector_type<T, 1, typename std::enable_if_t<is_native_type<T>()>>
__device__ int static err = 0; __device__ int static err = 0;
template <typename T> template <typename T>
struct vector_type<T, 2, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 2, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -313,7 +325,7 @@ struct vector_type<T, 2, typename std::enable_if_t<is_native_type<T>()>> ...@@ -313,7 +325,7 @@ struct vector_type<T, 2, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 3, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 3, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -383,7 +395,7 @@ struct vector_type<T, 3, typename std::enable_if_t<is_native_type<T>()>> ...@@ -383,7 +395,7 @@ struct vector_type<T, 3, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 4, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 4, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -453,7 +465,7 @@ struct vector_type<T, 4, typename std::enable_if_t<is_native_type<T>()>> ...@@ -453,7 +465,7 @@ struct vector_type<T, 4, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 5, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 5, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d4_t __attribute__((ext_vector_type(4))); typedef T d4_t __attribute__((ext_vector_type(4)));
...@@ -523,7 +535,7 @@ struct vector_type<T, 5, typename std::enable_if_t<is_native_type<T>()>> ...@@ -523,7 +535,7 @@ struct vector_type<T, 5, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 7, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 7, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -605,7 +617,7 @@ struct vector_type<T, 7, typename std::enable_if_t<is_native_type<T>()>> ...@@ -605,7 +617,7 @@ struct vector_type<T, 7, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 8, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 8, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -687,7 +699,7 @@ struct vector_type<T, 8, typename std::enable_if_t<is_native_type<T>()>> ...@@ -687,7 +699,7 @@ struct vector_type<T, 8, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 13, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 13, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d4_t __attribute__((ext_vector_type(4))); typedef T d4_t __attribute__((ext_vector_type(4)));
...@@ -769,7 +781,7 @@ struct vector_type<T, 13, typename std::enable_if_t<is_native_type<T>()>> ...@@ -769,7 +781,7 @@ struct vector_type<T, 13, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 16, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 16, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -863,7 +875,7 @@ struct vector_type<T, 16, typename std::enable_if_t<is_native_type<T>()>> ...@@ -863,7 +875,7 @@ struct vector_type<T, 16, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 32, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 32, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -967,7 +979,7 @@ struct vector_type<T, 32, typename std::enable_if_t<is_native_type<T>()>> ...@@ -967,7 +979,7 @@ struct vector_type<T, 32, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 64, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 64, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -1083,7 +1095,7 @@ struct vector_type<T, 64, typename std::enable_if_t<is_native_type<T>()>> ...@@ -1083,7 +1095,7 @@ struct vector_type<T, 64, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 128, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 128, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -1209,7 +1221,7 @@ struct vector_type<T, 128, typename std::enable_if_t<is_native_type<T>()>> ...@@ -1209,7 +1221,7 @@ struct vector_type<T, 128, typename std::enable_if_t<is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 256, typename std::enable_if_t<is_native_type<T>()>> struct vector_type<T, 256, typename ck::enable_if_t<is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2))); typedef T d2_t __attribute__((ext_vector_type(2)));
...@@ -1374,7 +1386,7 @@ template <typename T, index_t N> ...@@ -1374,7 +1386,7 @@ template <typename T, index_t N>
struct non_native_vector_base< struct non_native_vector_base<
T, T,
N, N,
std::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>> ck::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
{ {
using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T
static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch"); static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
...@@ -1499,7 +1511,7 @@ struct scalar_type<non_native_vector_base<pk_i4_t, N>> ...@@ -1499,7 +1511,7 @@ struct scalar_type<non_native_vector_base<pk_i4_t, N>>
// non-native vector_type implementation // non-native vector_type implementation
template <typename T> template <typename T>
struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 1, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d1_nnv_t = non_native_vector_base<T, 1>; using d1_nnv_t = non_native_vector_base<T, 1>;
...@@ -1550,7 +1562,7 @@ struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1550,7 +1562,7 @@ struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 2, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d1_nnv_t = non_native_vector_base<T, 1>; using d1_nnv_t = non_native_vector_base<T, 1>;
...@@ -1613,7 +1625,7 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1613,7 +1625,7 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 4, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d1_nnv_t = non_native_vector_base<T, 1>; using d1_nnv_t = non_native_vector_base<T, 1>;
...@@ -1686,7 +1698,7 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1686,7 +1698,7 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 8, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d1_nnv_t = non_native_vector_base<T, 1>; using d1_nnv_t = non_native_vector_base<T, 1>;
...@@ -1771,7 +1783,7 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1771,7 +1783,7 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 16, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d1_nnv_t = non_native_vector_base<T, 1>; using d1_nnv_t = non_native_vector_base<T, 1>;
...@@ -1866,7 +1878,7 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1866,7 +1878,7 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 32, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 32, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d2_t = non_native_vector_base<T, 2>; using d2_t = non_native_vector_base<T, 2>;
...@@ -1970,7 +1982,7 @@ struct vector_type<T, 32, typename std::enable_if_t<!is_native_type<T>()>> ...@@ -1970,7 +1982,7 @@ struct vector_type<T, 32, typename std::enable_if_t<!is_native_type<T>()>>
}; };
template <typename T> template <typename T>
struct vector_type<T, 64, typename std::enable_if_t<!is_native_type<T>()>> struct vector_type<T, 64, typename ck::enable_if_t<!is_native_type<T>()>>
{ {
using d1_t = T; using d1_t = T;
using d2_t = non_native_vector_base<T, 2>; using d2_t = non_native_vector_base<T, 2>;
...@@ -2210,20 +2222,230 @@ using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type; ...@@ -2210,20 +2222,230 @@ using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type; using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type; using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;
#ifdef CK_CODE_GEN_RTC
// hipRTC build: <limits> is unavailable, so NumericLimits is specialized per
// type with hand-written constants instead of delegating to std::numeric_limits.
template <typename T>
struct NumericLimits;

// Limits for 32-bit signed integers. Infinity()/QuietNaN() return 0 because
// those concepts do not apply to integral types; they exist only to keep the
// NumericLimits interface uniform across specializations.
template <>
struct NumericLimits<int32_t>
{
// Written as -2147483647 - 1 because the literal 2147483648 does not fit in int.
__host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
__host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
__host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
__host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
};
// Limits for 16-bit signed integers; Infinity()/QuietNaN() return 0 (not
// meaningful for integral types) to keep the interface uniform.
template <>
struct NumericLimits<int16_t>
{
__host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
__host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
__host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
__host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
};
// Limits for 8-bit signed integers; Infinity()/QuietNaN() return 0 (not
// meaningful for integral types) to keep the interface uniform.
template <>
struct NumericLimits<int8_t>
{
__host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
__host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
__host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
__host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
};
// Limits for 32-bit unsigned integers; Lowest()/Min() are both 0, and
// Infinity()/QuietNaN() return 0 to keep the interface uniform.
template <>
struct NumericLimits<uint32_t>
{
__host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
__host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
__host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
__host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
};
// Limits for 16-bit unsigned integers; Lowest()/Min() are both 0, and
// Infinity()/QuietNaN() return 0 to keep the interface uniform.
template <>
struct NumericLimits<uint16_t>
{
__host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
__host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
__host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
__host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
__host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
};
// Numeric limits for IEEE-754 binary32 (float), expressed as bit patterns so the
// values can be produced via bit_cast under hipRTC, where <limits> is unavailable.
template <>
struct NumericLimits<float>
{
    static constexpr unsigned int binary_min    = 0x00800000; // smallest positive normal
    static constexpr unsigned int binary_max    = 0x7F7FFFFF; // largest finite
    static constexpr unsigned int binary_lowest = 0xFF7FFFFF; // most negative finite
    // NOTE(review): this is a *negative* quiet NaN with payload 1; any NaN bit
    // pattern satisfies "is NaN", but confirm callers do not compare exact bits
    // against std::numeric_limits<float>::quiet_NaN() (0x7FC00000) in host builds.
    static constexpr unsigned int binary_qnan = 0xFFC00001;
    // BUGFIX: was 0x7F8000000 (nine hex digits). That value does not fit in
    // unsigned int; it truncates to 0xF8000000, which bit-casts to a large
    // finite negative float, not +infinity. IEEE-754 +inf is 0x7F800000
    // (sign 0, exponent all ones, mantissa 0).
    static constexpr unsigned int binary_inf = 0x7F800000;

    __host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
    __host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
    __host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
    __host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
    __host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
};
// Numeric limits for IEEE-754 binary16 (half_t), expressed as bit patterns so
// the values can be produced via bit_cast under hipRTC, where <limits> is
// unavailable.
template <>
struct NumericLimits<half_t>
{
    static constexpr unsigned short binary_min    = 0x0400; // smallest positive normal, 2^-14
    static constexpr unsigned short binary_max    = 0x7BFF; // largest finite, 65504
    static constexpr unsigned short binary_lowest = 0xFBFF; // most negative finite, -65504
    static constexpr unsigned short binary_qnan   = 0x7FFF; // a quiet NaN pattern
    static constexpr unsigned short binary_inf    = 0x7C00; // +infinity (exp all ones, mantissa 0)

    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
    // Added for interface parity: the integer/float specializations above and the
    // non-RTC generic NumericLimits template all expose Infinity().
    __host__ __device__ static constexpr half_t Infinity() { return bit_cast<half_t>(binary_inf); }
};
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// Limits for the experimental 4-bit integer (_BitInt(4)): range [-8, 7].
// NOTE(review): unlike the other integral specializations this one omits
// Infinity()/QuietNaN() -- confirm no RTC caller requires them.
template <>
struct NumericLimits<int4_t>
{
__host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
__host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
__host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
};
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// Limits for the FNUZ (finite, negative-zero-is-NaN) FP8 E4M3 format. Constants
// are raw bit patterns; only the NaN-on-negative-zero encoding (exp bias 8) is
// active, the IEEE-style alternative (exp bias 7) is kept below for reference.
template <>
struct NumericLimits<f8_fnuz_t>
{
// negative zero nan mode with exp bias = 8
static constexpr uint8_t binary_min = 0x08; // 0b00001000
static constexpr uint8_t binary_max = 0x7F; // 0b01111111
static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
// ieee mode with exp bias = 7
// static constexpr uint8_t binary_min = 0x08; // 0b00001000
// static constexpr uint8_t binary_max = 0x77; // 0b01110111
// static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
// static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0
__host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
__host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
__host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
__host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
};
// Limits for the FNUZ (finite, negative-zero-is-NaN) FP8 E5M2 format. Constants
// are raw bit patterns; only the NaN-on-negative-zero encoding (exp bias 16) is
// active, the IEEE-style alternative (exp bias 15) is kept below for reference.
template <>
struct NumericLimits<bf8_fnuz_t>
{
// negative zero nan mode with exp bias = 16
static constexpr uint8_t binary_min = 0x04; // 0b00000100
static constexpr uint8_t binary_max = 0x7F; // 0b01111111
static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
// ieee mode with exp bias = 15
// static constexpr uint8_t binary_min = 0x04; // 0b00000100
// static constexpr uint8_t binary_max = 0x7B; // 0b01111011
// static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
// static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0
__host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
__host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
__host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
__host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
};
// Limits for the OCP FP8 E4M3 format (finite range +/-448, dedicated NaN at
// 0x7F). Values are constructed via bit_cast from their raw encodings.
template <>
struct NumericLimits<f8_ocp_t>
{
static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6
static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448
static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111
__host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
__host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
__host__ __device__ static constexpr f8_ocp_t Lowest()
{
return bit_cast<f8_ocp_t>(binary_lowest);
}
__host__ __device__ static constexpr f8_ocp_t QuietNaN()
{
return bit_cast<f8_ocp_t>(binary_qnan);
}
};
// Limits for the OCP FP8 E5M2 format (finite range +/-57344, NaN when the
// exponent is all ones and the mantissa is non-zero). Values are constructed
// via bit_cast from their raw encodings.
template <>
struct NumericLimits<bf8_ocp_t>
{
static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14
static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344
static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101
__host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
__host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
__host__ __device__ static constexpr bf8_ocp_t Lowest()
{
return bit_cast<bf8_ocp_t>(binary_lowest);
}
__host__ __device__ static constexpr bf8_ocp_t QuietNaN()
{
return bit_cast<bf8_ocp_t>(binary_qnan);
}
};
#else
template <typename T> template <typename T>
struct NumericLimits struct NumericLimits
{ {
__host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); } __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
__host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); } __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
__host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); } __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
__host__ __device__ static constexpr T QuietNaN() __host__ __device__ static constexpr T QuietNaN()
{ {
return std::numeric_limits<T>::quiet_NaN(); return std::numeric_limits<T>::quiet_NaN();
} }
__host__ __device__ static constexpr T Infinity() { return std::numeric_limits<T>::infinity(); } __host__ __device__ static constexpr T Infinity() { return std::numeric_limits<T>::infinity(); }
}; };
...@@ -2347,6 +2569,7 @@ struct NumericLimits<bf8_ocp_t> ...@@ -2347,6 +2569,7 @@ struct NumericLimits<bf8_ocp_t>
return bit_cast<bf8_ocp_t>(binary_qnan); return bit_cast<bf8_ocp_t>(binary_qnan);
} }
}; };
#endif
template <typename T> template <typename T>
struct NumericUtils struct NumericUtils
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef UTILITY_DEBUG_HPP #ifndef UTILITY_DEBUG_HPP
#define UTILITY_DEBUG_HPP #define UTILITY_DEBUG_HPP
#include "type.hpp"
namespace ck { namespace ck {
namespace debug { namespace debug {
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
namespace ck { namespace ck {
#ifndef CK_CODE_GEN_RTC
template <bool B, typename T = void> template <bool B, typename T = void>
using enable_if = std::enable_if<B, T>; using enable_if = std::enable_if<B, T>;
template <bool B, typename T = void> template <bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type; using enable_if_t = typename std::enable_if<B, T>::type;
#else
// Minimal re-implementation of std::enable_if for hipRTC builds, where
// <type_traits> is unavailable. The primary template has no ::type member, so
// substituting enable_if_t<false, T> fails and removes the overload (SFINAE).
template <bool B, class T = void>
struct enable_if
{
};
// Partial specialization for B == true exposes ::type.
template <class T>
struct enable_if<true, T>
{
using type = T;
};
// Convenience alias mirroring std::enable_if_t.
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
#endif
} // namespace ck } // namespace ck
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CODE_GEN_RTC
#pragma once #pragma once
#include <cstdlib> #include <cstdlib>
...@@ -183,3 +184,4 @@ void UpdateEnvVar(EnvVar, const std::string_view& val) ...@@ -183,3 +184,4 @@ void UpdateEnvVar(EnvVar, const std::string_view& val)
} }
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y) ...@@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y)
{ {
if constexpr(predicate) if constexpr(predicate)
{ {
return std::forward<X>(x); return ck::forward<X>(x);
} }
else else
{ {
return std::forward<Y>(y); return ck::forward<Y>(y);
} }
} }
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_FUNCTIONAL4_HPP #ifndef CK_FUNCTIONAL4_HPP
#define CK_FUNCTIONAL4_HPP #define CK_FUNCTIONAL4_HPP
...@@ -21,7 +21,7 @@ struct unpack_impl<Sequence<Is...>> ...@@ -21,7 +21,7 @@ struct unpack_impl<Sequence<Is...>>
template <typename F, typename X> template <typename F, typename X>
__host__ __device__ constexpr auto operator()(F&& f, X&& x) const __host__ __device__ constexpr auto operator()(F&& f, X&& x) const
{ {
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...); return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...);
} }
}; };
...@@ -35,8 +35,8 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>> ...@@ -35,8 +35,8 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
template <typename F, typename X, typename Y> template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
{ {
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})..., return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...,
std::forward<Y>(y).At(Number<Js>{})...); ck::forward<Y>(y).At(Number<Js>{})...);
} }
}; };
...@@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x) ...@@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
{ {
using X_ = remove_reference_t<X>; using X_ = remove_reference_t<X>;
return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}( return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
std::forward<F>(f), std::forward<X>(x)); ck::forward<F>(f), ck::forward<X>(x));
} }
// TODO: properly implement unpack that takes any number of containers // TODO: properly implement unpack that takes any number of containers
...@@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y) ...@@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
using Y_ = remove_reference_t<Y>; using Y_ = remove_reference_t<Y>;
return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type, return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}( typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y)); ck::forward<F>(f), ck::forward<X>(x), ck::forward<Y>(y));
} }
} // namespace ck } // namespace ck
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -48,4 +48,9 @@ __host__ __device__ constexpr auto operator%(integral_constant<TX, X>, integral_ ...@@ -48,4 +48,9 @@ __host__ __device__ constexpr auto operator%(integral_constant<TX, X>, integral_
return integral_constant<decltype(X % Y), X % Y>{}; return integral_constant<decltype(X % Y), X % Y>{};
} }
// Aliases mirroring std::bool_constant / std::true_type / std::false_type,
// built on ck::integral_constant so hipRTC builds do not need <type_traits>.
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type = bool_constant<true>;
using false_type = bool_constant<false>;
} // namespace ck } // namespace ck
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include "ck/utility/integral_constant.hpp"
namespace ck { namespace ck {
namespace detail { namespace detail {
template <class Default, class AlwaysVoid, template <class...> class Op, class... Args> template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
struct detector struct detector
{ {
using value_t = std::false_type; using value_t = integral_constant<bool, false>;
using type = Default; using type = Default;
}; };
template <class Default, template <class...> class Op, class... Args> template <class Default, template <class...> class Op, class... Args>
struct detector<Default, std::void_t<Op<Args...>>, Op, Args...> struct detector<Default, ck::void_t<Op<Args...>>, Op, Args...>
{ {
using value_t = std::true_type; using value_t = integral_constant<bool, true>;
using type = Op<Args...>; using type = Op<Args...>;
}; };
} // namespace detail } // namespace detail
...@@ -32,12 +34,12 @@ template <template <class...> class Op, class... Args> ...@@ -32,12 +34,12 @@ template <template <class...> class Op, class... Args>
using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t; using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;
template <typename T> template <typename T>
using is_pack2_invocable_t = decltype(std::declval<T&>().is_pack2_invocable); using is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable);
template <typename T> template <typename T>
using is_pack4_invocable_t = decltype(std::declval<T&>().is_pack4_invocable); using is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable);
template <typename T> template <typename T>
using is_pack8_invocable_t = decltype(std::declval<T&>().is_pack8_invocable); using is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable);
} // namespace ck } // namespace ck
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CODE_GEN_RTC
#include <ostream> #include <ostream>
#endif
#pragma once #pragma once
...@@ -25,6 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler() ...@@ -25,6 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()
} // namespace ck } // namespace ck
#ifndef CK_CODE_GEN_RTC
inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
{ {
switch(s) switch(s)
...@@ -35,3 +39,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) ...@@ -35,3 +39,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
} }
return os; return os;
} }
#endif
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
#include "type.hpp" #include "type.hpp"
#include "tuple.hpp" #include "tuple.hpp"
#ifdef CK_CODE_GEN_RTC
// <cstdint> is unavailable under hipRTC; define the one macro this header uses.
#define INT32_MAX 2147483647
#endif
namespace ck { namespace ck {
// magic number division // magic number division
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -19,7 +19,7 @@ extern "C" __device__ float __ocml_native_recip_f32(float); ...@@ -19,7 +19,7 @@ extern "C" __device__ float __ocml_native_recip_f32(float);
#endif #endif
// math functions for the host, some are implemented by calling C++ std functions // math functions for the host, some are implemented by calling C++ std functions
#ifndef CK_CODE_GEN_RTC
static inline __host__ float abs(float x) { return std::abs(x); }; static inline __host__ float abs(float x) { return std::abs(x); };
static inline __host__ double abs(double x) { return std::abs(x); }; static inline __host__ double abs(double x) { return std::abs(x); };
...@@ -459,7 +459,7 @@ inline __host__ double expm1<double>(double x) ...@@ -459,7 +459,7 @@ inline __host__ double expm1<double>(double x)
{ {
return std::expm1(x); return std::expm1(x);
} }
#endif
// math functions for the HIP kernel, some are implemented by calling hip builtin functions // math functions for the HIP kernel, some are implemented by calling hip builtin functions
static inline __device__ float abs(float x) { return ::abs(x); }; static inline __device__ float abs(float x) { return ::abs(x); };
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <ck/utility/ignore.hpp>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#ifdef CK_CODE_GEN_RTC
using uint8_t = unsigned char;
using uint16_t = unsigned short;
using uint32_t = unsigned int;
#endif
namespace ck { namespace ck {
// Pseudo random number generator // Pseudo random number generator
// version for fp32 // version for fp32
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false> template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<float, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{ {
uint32_t x = *(reinterpret_cast<uint32_t*>(&val)); uint32_t x = *(reinterpret_cast<uint32_t*>(&val));
...@@ -25,7 +30,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = ...@@ -25,7 +30,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
} }
// version for fp16 // version for fp16
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<_Float16, T>{}, bool> = false> template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{ {
uint16_t x = *(reinterpret_cast<uint16_t*>(&val)); uint16_t x = *(reinterpret_cast<uint16_t*>(&val));
...@@ -40,15 +45,14 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = ...@@ -40,15 +45,14 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
} }
// return 0 if data is not fp16 or fp32 // return 0 if data is not fp16 or fp32
template < template <typename T,
typename T, uint32_t seed_t,
uint32_t seed_t, ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t) __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
{ {
std::ignore = id; ck::ignore = id;
std::ignore = val; ck::ignore = val;
std::ignore = seed; ck::ignore = seed;
return 0; return 0;
} }
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#ifndef CK_CODE_GEN_RTC
#include <ostream> #include <ostream>
#endif
#include "ck/utility/integral_constant.hpp" #include "ck/utility/integral_constant.hpp"
#include "ck/utility/type.hpp" #include "ck/utility/type.hpp"
...@@ -900,6 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type; ...@@ -900,6 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;
} // namespace ck } // namespace ck
#ifndef CK_CODE_GEN_RTC
template <ck::index_t... Is> template <ck::index_t... Is>
std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>) std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
{ {
...@@ -910,3 +913,4 @@ std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>) ...@@ -910,3 +913,4 @@ std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
os << S::At(S::Size() - ck::Number<1>{}).value << "}"; os << S::At(S::Size() - ck::Number<1>{}).value << "}";
return os; return os;
} }
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment