#ifndef CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP

#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"

#define CK_UNSIGNED_INDEX_TYPE 0

#define CK_DEVICE_BACKEND_NVIDIA 1

#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0

#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

namespace ck {

// On the NVIDIA backend the global address space is treated the same as the
// generic one (AMD buffer addressing is not used, see the macros above)
enum address_space_t
{
    generic = 0,
    global  = generic
};

#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
using index_t = int32_t;
#endif

// For some reason, CUDA needs these definitions; otherwise the compiler
// won't generate optimal load and store instructions, and the kernel would
// produce wrong results, indicating the compiler fails to generate correct
// instructions
using float2_t = float2;
using float4_t = float4;

// data type conversion
template <class T>
struct type_convert
{
    template <class X>
    __device__ T operator()(const X& x) const
    {
        return static_cast<T>(x);
    }
};

} // namespace ck
#endif
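
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this header): one way the ck::index_t and
// ck::type_convert utilities defined above might be used inside a device
// kernel. The kernel name and its buffer/size parameters are hypothetical,
// chosen only for this example.
//
//   __global__ void cast_copy(const double* src, float* dst, ck::index_t n)
//   {
//       // one element per thread, indexed with the library's index_t
//       const ck::index_t i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//       {
//           // type_convert<float> converts the double input via static_cast
//           dst[i] = ck::type_convert<float>{}(src[i]);
//       }
//   }
// ---------------------------------------------------------------------------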