// config.amd.hpp.in: compile-time configuration macros for the AMD device backend.
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

// HIP runtime headers; a build may define MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
// to skip them.
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

// Address-space qualifier for kernel parameters.
// NOTE(review): address_space(4) is presumably the AMDGPU constant address
// space -- confirm against the LLVM AMDGPU documentation.
#define __CONSTANT__ __attribute__((address_space(4)))

// Device backend: marks this configuration as the AMD backend.
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID: this chain defines exactly one CK_AMD_GPU_GFX* macro. The target is
// chosen by editing which branch is enabled; gfx1030 is the one currently on.
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif

// HIP version number; falls back to 0 when the build does not provide one.
#if !defined(CK_HIP_VERSION_FLAT)
#define CK_HIP_VERSION_FLAT 0
#endif

// Launch bounds.
// CK_USE_LAUNCH_BOUNDS is the on/off switch (default 0); the build may
// override it.
#ifndef CK_USE_LAUNCH_BOUNDS
#define CK_USE_LAUNCH_BOUNDS 0
#endif

// The limits are defined whatever the value of the switch. Testing the switch
// with #ifdef here would always succeed, because the macro is always defined
// by this point, even when it is 0.
// NOTE(review): if the limits should only exist when the switch is non-zero,
// use `#if CK_USE_LAUNCH_BOUNDS` -- first confirm that no code references the
// limits unconditionally.
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1

// Buffer resource: per-GPU value of CK_BUFFER_RESOURCE_3RD_DWORD.
// gfx906 and gfx908 share one value; gfx1030 uses another. The macro is left
// undefined when none of the known CK_AMD_GPU_GFX* macros is defined.
#if defined(CK_AMD_GPU_GFX906)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// Multi index: switch for the dynamically indexed multi-index (default 0).
// Guarded so the build can override it, like the other switches in this file.
#ifndef CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
#endif

// AMD inline asm switches; each defaults to 1 unless the build defines it.
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

// AMD buffer addressing; defaults to 1 unless the build defines it.
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

// Only gfx908 supports native floating-point atomic add; off (0) by default.
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

// AMD XDLOPS switches; each defaults to 0 unless the build defines it.
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0
#endif

#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif

#ifndef CK_USE_AMD_XDLOPS_EMULATE
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif

// Block synchronization waits only on lgkmcnt(0), not on vmcnt(0).
// Defaults to 1 unless the build defines it.
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif

// Experimental implementation switches. Every switch is guarded so the build
// can override its default.

// Out-of-bound check "offset trick" for buffer load / store / atomic.
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

// Pipelined blockwise GEMM.
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif

// Implicit GEMM backward-data v4r1: skip the out-of-bound check on the
// output / input side. Both default to 0.
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

// Pass tensor descriptor by value or by void*; by-value is the one enabled.
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0

// hack: each hack relies on an underlying assumption that must be satisfied;
// violating it is a bug.
// This hack forces idx_diff_low_const to be kept in an SGPR. idx_diff_low_const
// must be thread-invariant, otherwise it's a bug.
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#if !defined(CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE)
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workaround: put all workarounds here. Each defaults to 1 (enabled) unless
// the build defines it.

// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif

namespace ck {

// Address-space tags. Kept as an unscoped enum so both ck::Global and
// ck::AddressSpace::Global spell the same enumerator.
enum AddressSpace
{
    Generic,
    Global,
    Lds,
    Vgpr
};

// In-memory data operation tags: Set or AtomicAdd.
enum InMemoryDataOperation
{
    Set,
    AtomicAdd
};

// index type
using index_t = int32_t;

} // namespace ck
#endif