wait.h 2.45 KB
Newer Older
zhangshao's avatar
zhangshao committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#ifndef WAIT_H
#define WAIT_H

#define USE_PINGPANG_BUFFER


namespace flash {

__forceinline__ __device__ void wait_all_warp_arrived() {
    __builtin_amdgcn_sched_barrier(0);
    asm volatile("s_barrier\n");
    __builtin_amdgcn_sched_barrier(0);
}


template<bool Sync>
__forceinline__ __device__ void wait_all_buffer_data_arrived() {
    __builtin_amdgcn_sched_barrier(0);
    if constexpr (Sync) {
        asm volatile("s_waitcnt vmcnt(0)\n\ts_barrier\n");
    } else {
        asm volatile("s_waitcnt vmcnt(0)\n");
    }
    __builtin_amdgcn_sched_barrier(0);
}


template<bool Sync>
__forceinline__ __device__ void wait_buffer_data_arrived(const int wait_count=0) {
    __builtin_amdgcn_sched_barrier(0);
    if constexpr (Sync) {
        asm volatile("s_waitcnt vmcnt(%0)\n\ts_barrier\n":: "n"(wait_count));
    } else {
        asm volatile("s_waitcnt vmcnt(%0)\n":: "n"(wait_count));
    }
    __builtin_amdgcn_sched_barrier(0);
}


template<bool Sync>
__forceinline__ __device__ void wait_lds_data_arrived(const int wait_count=0) {
    __builtin_amdgcn_sched_barrier(0);
    if constexpr (Sync) {
        asm volatile("s_waitcnt lgkmcnt(%0)\n\ts_barrier\n":: "n"(wait_count));
    } else {
        asm volatile("s_waitcnt lgkmcnt(%0)\n":: "n"(wait_count));
    }
    __builtin_amdgcn_sched_barrier(0);
}

} // namespace flash


template<const int COUNT>
__forceinline__ __device__ void buffer_load_lds_dwordx1_wait() {
    asm volatile(
      "s_waitcnt vmcnt(%0)\n\t"
      "s_barrier\n"
      :: "B"(COUNT)
      :);
}

template<const int COUNT>
__forceinline__ __device__ void buffer_load_lds_dwordx1_wait_nosync() {
    asm volatile(
      "s_waitcnt vmcnt(%0)\n\t"
      :: "B"(COUNT)
      :);
}


template<int BLOCK_M, int BLOCK_N, int BLOCK_K>
inline __device__ void buffer_load_lds_dwordx1_wait() {
asm volatile("s_waitcnt vmcnt(0) \n\t"
                "s_barrier");
}

__forceinline__ __device__ void s_barrier() {
    asm volatile("s_barrier\n");
}

#define lgkmcnt_wait(X)\
__builtin_amdgcn_sched_barrier(0);\
asm volatile("s_waitcnt lgkmcnt(%0)": : "I"(X));\
__builtin_amdgcn_sched_barrier(0);

#define vmcnt_wait(X)\
__builtin_amdgcn_sched_barrier(0);\
    asm volatile(\
      "s_waitcnt vmcnt(%0)\n\t"\
      "s_barrier\n"\
      :: "I"(X)\
      :);\
__builtin_amdgcn_sched_barrier(0);   

#define vmcnt_wait_nosync(X)\
__builtin_amdgcn_sched_barrier(0);\
    asm volatile(\
      "s_waitcnt vmcnt(%0)\n\t"\
      :: "I"(X)\
      :);\
__builtin_amdgcn_sched_barrier(0);   

#endif