#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"

#define CK_UNSIGNED_INDEX_TYPE 0

#define CK_DEVICE_BACKEND_AMD 1

#define CK_USE_AMD_INTRINSIC 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1

#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

namespace ck {

// memory address spaces a pointer may refer to
enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
using index_t = int32_t;
#endif

// For some reason, the HIP compiler needs these definitions to generate optimal
// load and store instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

// data type conversion: a device-side functor wrapping static_cast
template <typename T>
struct type_convert
{
    template <typename X>
    __device__ T operator()(X x) const
    {
        return static_cast<T>(x);
    }
};

} // namespace ck
#endif
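
// Usage sketch (illustrative, not part of the original header): type_convert
// can be applied in device code to cast values before a store, e.g. converting
// float accumulators to int32_t. The kernel below is a hypothetical example,
// assuming this header is included from HIP device code.
//
//   __global__ void cast_kernel(const float* in, int32_t* out, ck::index_t n)
//   {
//       const ck::index_t i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//           out[i] = ck::type_convert<int32_t>{}(in[i]);
//   }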