#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

// device backend
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
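// exactly one GPU target is expected to be enabled below (currently gfx1030);
// flip the 0/1 values to select a different target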
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif

// HIP version
#ifndef CK_HIP_VERSION_FLAT
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
#define CK_USE_LAUNCH_BOUNDS 0

#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif
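// illustrative sketch (an assumption about how kernels in this project apply the
// limits above, not something defined in this header):
//   __global__ void __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
//   some_kernel(/* args */) { /* ... */ }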

// buffer resource
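// CK_BUFFER_RESOURCE_3RD_DWORD is presumably the fourth (index 3) dword of the
// 128-bit buffer resource descriptor used by buffer_load/buffer_store; its
// encoding differs between gfx906/gfx908 (GCN) and gfx1030 (RDNA2), hence the
// per-GPU values below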
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

// AMD buffer addressing
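// when enabled, global memory is accessed through AMD buffer intrinsics
// (buffer_load/buffer_store) rather than plain global/flat pointer addressing,
// which allows out-of-bounds handling via the buffer resource descriptor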
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

// only gfx908 supports native floating-point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0
#endif

#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif

#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif

// block synchronization only uses s_waitcnt lgkmcnt(0), not vmcnt(0)
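// (presumably the LDS barrier is implemented as s_waitcnt lgkmcnt(0) followed by
// s_barrier, without also waiting for outstanding vector memory operations)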
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif

// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

// pass tensor descriptor by value, pointer or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
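// likely meaning of the three modes above (an inference, not documented here):
// "by value" passes the descriptor object directly as a kernel argument,
// "by pointer" passes a pointer to a descriptor placed in device memory, and
// "by void*" passes an opaque pointer that the kernel casts back to the
// concrete descriptor type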

// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
// hack for forcing idx_diff_low_const to be kept in an SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using the mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif

namespace ck {

enum AddressSpace
{
    Generic,
    Global,
    Lds,
    Vgpr
};

enum InMemoryDataOperation
{
    Set,
    AtomicAdd
};

// index type
using index_t = int32_t;
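// illustrative sketch (hypothetical function, not declared in this header) of how
// these enums and index_t typically appear as compile-time parameters:
//   template <AddressSpace SrcAddrSpace, AddressSpace DstAddrSpace, InMemoryDataOperation DstOp>
//   __device__ void copy(const float* p_src, float* p_dst, index_t n);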

} // namespace ck
#endif