config.amd.hpp.in 4.58 KB
Newer Older
1
2
3
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

Chao Liu's avatar
Chao Liu committed
4
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
5
6
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
Chao Liu's avatar
Chao Liu committed
7
#endif
8
9
#include "bfloat16_dev.hpp"

10
11
12
// Address-space qualifier for kernel parameters.
// Expands to the compiler attribute address_space(4); per the original note it is meant to
// annotate data that is passed in as a kernel parameter.
#define __CONSTANT__ __attribute__((address_space(4)))

13
14
15
// Device backend: this configuration header marks the AMD backend as enabled (value 1).
#define CK_DEVICE_BACKEND_AMD 1

Chao Liu's avatar
Chao Liu committed
16
// GPU ID
// Exactly one CK_AMD_GPU_* macro is defined by the hard-coded #if/#elif chain below: the first
// branch whose literal condition is 1 wins, so gfx906 is the target currently selected.
// Edit the 0/1 literals to select a different target.
#if 1
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif
Chao Liu's avatar
Chao Liu committed
24
25
26
27
28
29
30

// HIP version: CK_HIP_VERSION_FLAT falls back to 0 unless it was already defined before this
// point (for example on the compiler command line).
#if !defined(CK_HIP_VERSION_FLAT)
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
// CK_USE_LAUNCH_BOUNDS is a 0/1 switch; it is currently 0.
#define CK_USE_LAUNCH_BOUNDS 0

// NOTE(review): the guard below is #ifdef, not #if. Because CK_USE_LAUNCH_BOUNDS is always
// defined just above (as 0), the two limits are defined regardless of the switch value.
// Kept unchanged because code outside this header may rely on the limits always being
// available -- confirm before tightening this to `#if CK_USE_LAUNCH_BOUNDS`.
#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif

// buffer resource
// Per-target constant named as the third dword of the buffer resource: gfx906 and gfx908 share
// one value, gfx1030 uses another.
// NOTE: there is no #else branch, so CK_BUFFER_RESOURCE_3RD_DWORD stays undefined when none of
// the CK_AMD_GPU_* macros tested here is defined.
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// multi index
// 0/1 switch for using a dynamically indexed multi index; currently disabled (0).
// Unlike most switches in this header it is not wrapped in #ifndef, so it cannot be
// pre-defined from outside without a redefinition.
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

48
49
50
51
52
53
54
55
56
// AMD inline asm
// Each switch below defaults to 1 and can be overridden by defining it before this point
// (for example on the compiler command line).
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

61
62
63
64
65
// AMD buffer addressing: enabled (1) by default; an earlier definition of the macro wins.
#if !defined(CK_USE_AMD_BUFFER_ADDRESSING)
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

Chao Liu's avatar
Chao Liu committed
66
// only gfx908 supports native floating point atomic add
// Disabled (0) by default; define the macro before this point to override.
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

71
72
// AMD XDLOPS
// All three switches default to 0 and can be overridden by defining them before this point.
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0
#endif

#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif

#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif

Chao Liu's avatar
Chao Liu committed
84
85
86
87
88
// Block synchronization: when this switch is 1, synchronization only issues s_wait on
// lgkmcnt(0) and does not wait on vmcnt(0). Enabled by default; an earlier definition wins.
#if !defined(CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM)
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif

89
// experimental implementation
// Out-of-bound check "offset trick" switches for buffer load / store / atomic.
// Defaults: load = 0, store = 1, atomic = 1. Each can be overridden by an earlier definition.
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

// Experimental: 0/1 switch for the pipelined blockwise GEMM; enabled (1) by default and
// overridable by an earlier definition.
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif
Chao Liu's avatar
Chao Liu committed
105
106

// Experimental: switches for skipping the out-of-bound check on the output / input side of
// implicit GEMM backward data V4R1. Both default to 0 (check is not skipped) and can be
// overridden by an earlier definition.
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

114
// pass tensor descriptor by value or void*
// Two 0/1 switches, one per passing style; by-value is currently 1 and by-void-pointer is 0.
// Neither is wrapped in #ifndef, so they cannot be pre-defined from outside without a
// redefinition.
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0

118
119
120
// Merge transformation: 0/1 switch for using magic-number division; currently disabled (0).
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0

Chao Liu's avatar
Chao Liu committed
121
122
123
124
125
126
127
128
// hack: each hack has an underlying assumption that must be satisfied, otherwise it is a bug.
// This one forces idx_diff_low_const to be kept in an SGPR; idx_diff_low_const must be
// thread-invariant, otherwise it is a bug. Disabled (0) unless defined earlier.
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#if !defined(CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE)
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

Chao Liu's avatar
Chao Liu committed
129
// workaround: put all workaround here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
// Enabled (1) by default; define the macro before this point to override.
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
Chao Liu's avatar
Chao Liu committed
134
135
136
137
138
139
140
141
142

// Workaround for accvgpr over-allocation: on (1) by default; an earlier definition wins.
#if !defined(CK_WORKAROUND_SWDEV_241664)
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
// Enabled (1) by default; define the macro before this point to override.
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif
144

145
// workaround for compiler crash when using buffer load/store for i8
// Enabled (1) by default; define the macro before this point to override.
#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
#endif

// workaround for compiler crash when using ds_write for i8
// (wording taken from the macro name; the previous comment described buffer load/store --
// TODO confirm the exact failing case)
// Enabled (1) by default; define the macro before this point to override.
#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
#endif

155
156
157
158
namespace ck {

// Address-space tags.
// Plain (unscoped) enum: the enumerators are reachable directly as ck::Generic, ck::Global, ...
// and take the implicit values 0..4 in declaration order.
enum AddressSpace
{
    Generic,
    Global,
    Lds,
    Sgpr,
    Vgpr
};

// In-memory data operation tags (Set = 0, AtomicAdd = 1).
// Presumably selects between a plain write and an atomic accumulate -- confirm against the
// code that consumes this enum.
enum InMemoryDataOperation
{
    Set,
    AtomicAdd
};

// index type
// NOTE: int32_t is not declared in this namespace; it has to be in scope already from the
// headers included at the top of this file.
using index_t = int32_t;

} // namespace ck
#endif