#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif

#include "bfloat16_dev.hpp"

// device backend
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030 0

// HIP version
#ifndef CK_HIP_VERSION_FLAT
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
#define CK_USE_LAUNCH_BOUNDS 0

#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif
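
// Illustrative sketch only (not part of this configuration): kernels typically consume
// the launch-bound macros above via HIP's __launch_bounds__ attribute, roughly like
// this ("gridwise_op" is a placeholder name, not a real library symbol):
//
//   __global__ void
//   #if CK_USE_LAUNCH_BOUNDS
//       __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
//   #endif
//       gridwise_op(/* kernel arguments */)
//   {
//       // kernel body
//   }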

// buffer resource
#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif CK_AMD_GPU_GFX1030
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif
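
// For reference (illustration only): AMD buffer addressing operates on a 128-bit buffer
// resource descriptor made of four 32-bit DWORDs; roughly, DWORDs 0-1 hold the base
// address, DWORD 2 the buffer range in bytes, and DWORD 3 the data-format/configuration
// bits. CK_BUFFER_RESOURCE_3RD_DWORD above is the GPU-family-specific value of that
// last DWORD, e.g. (placeholder names, assuming an int32x4_t vector type):
//
//   int32x4_t wave_buffer_resource;
//   // ... DWORDs 0-1 = base pointer, DWORD 2 = size in bytes ...
//   wave_buffer_resource.w = CK_BUFFER_RESOURCE_3RD_DWORD;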

// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif
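
// Note (interpretation): when CK_USE_AMD_BUFFER_ADDRESSING is enabled, global loads and
// stores are expected to go through AMD buffer intrinsics (the llvm.amdgcn.buffer.* /
// llvm.amdgcn.raw.buffer.* families) operating on a buffer resource descriptor, rather
// than plain pointer dereferences; when disabled, ordinary pointer accesses are used.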

// only gfx908 supports native floating-point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0
#endif

#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif

#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif

// block synchronization issues only s_waitcnt lgkmcnt(0), not vmcnt(0)
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif
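
// Minimal sketch (illustration only, not the library's exact implementation): with the
// flag above enabled, a block barrier waits only on outstanding LDS operations before
// s_barrier, instead of a full __syncthreads():
//
//   __device__ void block_sync_lds() // placeholder name for this sketch
//   {
//   #if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
//       asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier" ::: "memory");
//   #else
//       __syncthreads();
//   #endif
//   }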

// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

// pass tensor descriptor by value, pointer or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
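
// Note (interpretation of the three modes above): "by value" passes the tensor
// descriptor object directly as a kernel argument; "by pointer" passes a pointer to a
// descriptor prepared in memory; "by void*" passes a type-erased pointer that the
// kernel casts back to the concrete descriptor type. Presumably exactly one of the
// three is enabled at a time.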

// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug.
// hack for forcing idx_diff_low_const to be kept in an SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
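
// Illustration only: the "read first lane" trick forces a wave-uniform value into an
// SGPR via the readfirstlane intrinsic, e.g. (sketch; idx_component is a placeholder
// for one 32-bit component of idx_diff_low_const):
//
//   int32_t idx_component_sgpr = __builtin_amdgcn_readfirstlane(idx_component);
//
// This is only safe when the value is identical across all lanes of the wavefront,
// which is exactly the thread-invariance assumption stated above.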

// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif

namespace ck {

enum AddressSpace
{
    Generic,
    Global,
    Lds,
    Vgpr
};

enum InMemoryDataOperation
{
    Set,
    AtomicAdd
};

// index type
using index_t = int32_t;

} // namespace ck
#endif