config_nvidia.hpp.in 1.34 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
#ifndef CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP
3

Chao Liu's avatar
Chao Liu committed
4
#include "cuda_runtime.h"
Chao Liu's avatar
Chao Liu committed
5
#include "cuda_fp16.h"
Chao Liu's avatar
Chao Liu committed
6
7
#include "nvToolsExt.h"
#include "helper_cuda.h"
Chao Liu's avatar
Chao Liu committed
8

Chao Liu's avatar
Chao Liu committed
9
#define CK_UNSIGNED_INDEX_TYPE 0
Chao Liu's avatar
Chao Liu committed
10
#define CK_DEVICE_BACKEND_NVIDIA 1
Chao Liu's avatar
Chao Liu committed
11
#define CK_USE_AMD_INTRINSIC 0
Chao Liu's avatar
Chao Liu committed
12
#define CK_USE_AMD_INLINE_ASM 0
13
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
Chao Liu's avatar
Chao Liu committed
14
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
15
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
16
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
17
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
Chao Liu's avatar
Chao Liu committed
18

19
20
namespace ck {

21
22
enum address_space_t
{
Chao Liu's avatar
Chao Liu committed
23
24
    generic = 0,
    global  = generic
25
26
};

Chao Liu's avatar
Chao Liu committed
27
#if CK_UNSIGNED_INDEX_TYPE
28
using index_t = uint32_t;
Chao Liu's avatar
Chao Liu committed
29
#else
30
using index_t = int32_t;
Chao Liu's avatar
Chao Liu committed
31
32
#endif

Chao Liu's avatar
Chao Liu committed
33
34
35
36
37
38
// For some reason, CUDA need this definition, otherwise
//   compiler won't generate optimal load and store instruction, and
//   kernel would produce wrong result, indicating the compiler fail to generate correct
//   instruction,
using float2_t = float2;
using float4_t = float4;
Chao Liu's avatar
Chao Liu committed
39

Chao Liu's avatar
Chao Liu committed
40
41
42
43
44
45
46
47
48
49
50
// data type conversion
template <typename T>
struct type_convert
{
    template <typename X>
    __device__ T operator()(const X& x) const
    {
        return static_cast<T>(x);
    }
};

51
52
53
} // namespace ck

#endif