workgroup_barrier.hpp 1.75 KB
Newer Older
1
2
3
4
5
6
7
#pragma once
#include <hip/hip_runtime.h>
#include <stdint.h>

namespace ck {
struct workgroup_barrier
{
8
    __device__ workgroup_barrier(uint32_t* ptr) : base_ptr(ptr) {}
9

10
    __device__ uint32_t ld(uint32_t offset) const
11
12
13
14
    {
        return __atomic_load_n(base_ptr + offset, __ATOMIC_RELAXED);
    }

15
16
17
18
19
    __device__ void st(uint32_t offset, uint32_t value)
    {
        __atomic_store_n(base_ptr + offset, value, __ATOMIC_RELEASE);
    }

20
21
22
23
24
25
    __device__ void wait_eq(uint32_t offset, uint32_t value)
    {
        if(threadIdx.x == 0)
        {
            while(ld(offset) != value) {}
        }
26
        __builtin_amdgcn_s_barrier();
27
28
29
30
31
32
33
34
    }

    __device__ void wait_lt(uint32_t offset, uint32_t value)
    {
        if(threadIdx.x == 0)
        {
            while(ld(offset) < value) {}
        }
35
        __builtin_amdgcn_s_barrier();
36
37
38
39
40
41
    }

    __device__ void wait_set(uint32_t offset, uint32_t compare, uint32_t value)
    {
        if(threadIdx.x == 0)
        {
42
            while(atomicCAS(base_ptr + offset, compare, value) != compare) {}
43
        }
44
        __builtin_amdgcn_s_barrier();
45
46
47
48
49
50
51
52
53
54
    }

    // enter critical zoon, assume buffer is zero when launch kernel
    __device__ void aquire(uint32_t offset) { wait_set(offset, 0, 1); }

    // exit critical zoon, assume buffer is zero when launch kernel
    __device__ void release(uint32_t offset) { wait_set(offset, 1, 0); }

    __device__ void inc(uint32_t offset)
    {
55
        __builtin_amdgcn_s_barrier();
56
57
        if(threadIdx.x == 0)
        {
58
            atomicAdd(base_ptr + offset, 1);
59
60
61
        }
    }

62
63
    __device__ void reset(uint32_t offset)
    {
64
        __builtin_amdgcn_s_barrier();
65
66
67
68
69
70
        if(threadIdx.x == 0)
        {
            st(offset, 0);
        }
    }

71
    uint32_t* base_ptr;
72
73
};
} // namespace ck