config_amd.hpp.in 1.1 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
Chao Liu's avatar
Chao Liu committed
3
4
5
6
7
8
9

#cmakedefine01 CK_DEVICE_BACKEND_AMD

#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1

Chao Liu's avatar
Chao Liu committed
10
11
12
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0

Chao Liu's avatar
Chao Liu committed
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
namespace ck {

// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));

using index_t = uint32_t;

__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
    d += s0 * s1;
}

#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }

__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x;
    d += s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x + s0.y * s1.y;
}
#endif

} // namespace ck

#endif