config.amd.hpp.in 4.66 KB
Newer Older
1
2
3
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

Chao Liu's avatar
Chao Liu committed
4
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
5
6
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
Chao Liu's avatar
Chao Liu committed
7
#endif
8
9
#include "bfloat16_dev.hpp"

10
11
12
// address space for kernel parameter
// Qualifier macro that expands to the address_space(4) attribute.
#define __CONSTANT__ __attribute__((address_space(4)))

13
14
15
// device backend
#define CK_DEVICE_BACKEND_AMD 1

Chao Liu's avatar
Chao Liu committed
16
// GPU ID
// Compile-time target selection: the branch whose condition is set to 1 defines
// its GPU macro. gfx908 is the branch currently enabled; to retarget, flip the
// constants so that exactly one branch is 1.
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 1
#define CK_AMD_GPU_GFX908 1
#elif 0
#define CK_AMD_GPU_GFX1030 1
#endif
Chao Liu's avatar
Chao Liu committed
24
25
26
27
28
29
30

// HIP version
// Falls back to 0 when the build has not already supplied a value.
#if !defined(CK_HIP_VERSION_FLAT)
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
// Set to 0 to disable the limits defined below.
#define CK_USE_LAUNCH_BOUNDS 1

// Test the VALUE of the switch, not merely whether it is defined: the macro is
// always defined above, so an #ifdef here could never be turned off.
// NOTE(review): presumably these limits feed __launch_bounds__ on kernels —
// confirm at the use sites.
#if CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif

// buffer resource
// Value of CK_BUFFER_RESOURCE_3RD_DWORD for the selected target GPU:
// gfx906/gfx908 share one value, gfx1030 uses another. The macro is left
// undefined when none of the GPU macros tested below is defined.
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

48
49
50
51
52
53
54
55
56
// AMD inline asm
// Both switches default to 1; a definition supplied earlier by the build wins.
#if !defined(CK_USE_AMD_INLINE_ASM)
#define CK_USE_AMD_INLINE_ASM 1
#endif

#if !defined(CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM)
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

Chao Liu's avatar
Chao Liu committed
57
58
59
60
61
62
63
// AMD DLOPS
// Both switches default to 1; a definition supplied earlier by the build wins.
#ifndef CK_USE_AMD_DLOP
#define CK_USE_AMD_DLOP 1
#endif

#ifndef CK_USE_AMD_DLOP_INLINE_ASM
#define CK_USE_AMD_DLOP_INLINE_ASM 1
#endif

66
67
68
69
70
// AMD buffer addressing
// Enabled (1) unless the build has already defined the switch.
#if !defined(CK_USE_AMD_BUFFER_ADDRESSING)
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

Chao Liu's avatar
Chao Liu committed
71
// only gfx908 supports native floating point atomic add
// Disabled (0) by default; a definition supplied earlier by the build wins.
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

76
77
// AMD XDLOPS
// All three switches default to 0; a definition supplied earlier by the build wins.
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0
#endif

#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif

#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif

Chao Liu's avatar
Chao Liu committed
89
90
91
92
93
// When enabled, block synchronization waits only on lgkmcnt(0), not on vmcnt(0).
// Enabled (1) unless the build has already defined the switch.
#if !defined(CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM)
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif

94
// experimental implementation
// Each switch keeps a definition supplied earlier by the build and otherwise
// takes the default given here.

// out-of-bound check "offset trick" for buffer load / store / atomic
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

// pipelined blockwise GEMM
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif

// skipping of out-of-bound checks in implicit-GEMM backward-data v4r1
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

119
// pass tensor descriptor by value or void*
// The void* mode is the one currently switched on.
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
Chao Liu's avatar
Chao Liu committed
122

123
// merge transformation uses magic number division (currently switched off)
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
125

Chao Liu's avatar
Chao Liu committed
126
127
128
129
130
131
132
133
// hack: relies on an underlying assumption that must be satisfied; violating it is a bug.
// This hack forces idx_diff_low_const to be kept in an SGPR. idx_diff_low_const
// must be thread-invariant, otherwise it's a bug.
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#if !defined(CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE)
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

Chao Liu's avatar
Chao Liu committed
134
// workaround: put all workarounds here
// Each workaround is enabled (1) unless the build has already defined its switch.

// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif
149

150
// workaround for compiler crash when using buffer load/store for i8
#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
#endif

// workaround for compiler crash when using ds_write for i8
// NOTE(review): the original comment repeated the buffer load/store text; the
// macro name suggests ds_write is meant — confirm.
#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
#endif

160
161
162
163
namespace ck {

// Tag identifying an address space.
// NOTE(review): the enumerators presumably name GPU memory/register spaces —
// confirm at the use sites. Enumerator order (and thus value) is unchanged.
enum AddressSpace
{
    Generic,
    Global,
    Lds,
    Sgpr,
    Vgpr
};

// Tag selecting how a value is written to memory: plain store (Set) or
// accumulate (AtomicAdd).
// NOTE(review): semantics inferred from the enumerator names — confirm at the use sites.
enum InMemoryDataOperation
{
    Set,
    AtomicAdd
};

// index type: 32-bit signed integer
using index_t = int32_t;

} // namespace ck
#endif