tile_program.hpp 1.67 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
4
5
6
7
8

#include <hip/hip_runtime.h>

#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"

Chao Liu's avatar
Chao Liu committed
9
10
// hidden intermediate argument
struct Arg
Chao Liu's avatar
Chao Liu committed
11
{
Chao Liu's avatar
Chao Liu committed
12
13
14
15
16
    char data_[128];
    ck::index_t size_ = 0;
    ck::index_t pos_  = 0;

    __host__ __device__ void reset()
Chao Liu's avatar
Chao Liu committed
17
    {
Chao Liu's avatar
Chao Liu committed
18
19
20
        size_ = 0;
        pos_  = 0;
    }
Chao Liu's avatar
Chao Liu committed
21

Chao Liu's avatar
Chao Liu committed
22
    __device__ void reset_pos() { pos_ = 0; }
Chao Liu's avatar
Chao Liu committed
23
24
25

    // push arg on host
    template <typename T>
Chao Liu's avatar
Chao Liu committed
26
    __host__ T push(const T& a)
Chao Liu's avatar
Chao Liu committed
27
    {
Chao Liu's avatar
Chao Liu committed
28
        *reinterpret_cast<T*>(data_ + size_) = a;
Chao Liu's avatar
Chao Liu committed
29

Chao Liu's avatar
Chao Liu committed
30
        size_ += sizeof(T);
Chao Liu's avatar
Chao Liu committed
31
32
33
34
35
36

        return a;
    }

    // pull arg on device
    template <typename T>
Chao Liu's avatar
Chao Liu committed
37
    __device__ T pull()
Chao Liu's avatar
Chao Liu committed
38
    {
Chao Liu's avatar
Chao Liu committed
39
        T a = *reinterpret_cast<T*>(data_ + pos_);
Chao Liu's avatar
Chao Liu committed
40

Chao Liu's avatar
Chao Liu committed
41
        pos_ += sizeof(T);
Chao Liu's avatar
Chao Liu committed
42
43
44

        return a;
    }
Chao Liu's avatar
Chao Liu committed
45
};
Chao Liu's avatar
Chao Liu committed
46

Chao Liu's avatar
Chao Liu committed
47
48
49
50
51
// namespace tp (for tile programming)
struct TileProgram
{
    // arg on device
    Arg arg_;
Chao Liu's avatar
Chao Liu committed
52

Chao Liu's avatar
Chao Liu committed
53
    __device__ void gpu_init() { arg_.reset_pos(); }
Chao Liu's avatar
Chao Liu committed
54

Chao Liu's avatar
Chao Liu committed
55
56
57
    // push arg on host
    template <typename T>
    __host__ T operator()(const T& a)
Chao Liu's avatar
Chao Liu committed
58
    {
Chao Liu's avatar
Chao Liu committed
59
60
        return arg_.push(a);
    }
Chao Liu's avatar
Chao Liu committed
61

Chao Liu's avatar
Chao Liu committed
62
63
64
65
66
    // push arg on host
    template <typename T>
    __device__ T operator()(const T&)
    {
        return arg_.pull<T>();
Chao Liu's avatar
Chao Liu committed
67
68
69
70
71
72
    }
};

template <typename Program, typename... Xs>
__global__ void gpu_program_wrapper(Program f, TileProgram tp, Xs... xs)
{
Chao Liu's avatar
Chao Liu committed
73
    tp.gpu_init();
Chao Liu's avatar
Chao Liu committed
74
75
76
77
78
79
80
81
82
83
84
85
86
87
    f(tp, xs...);
}

template <typename Program, typename... Xs>
void launch(Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
{
    TileProgram tp;

    f(tp, xs...);

    printf("cpu arg size %d\n", tp.arg_.size_);

    gpu_program_wrapper<Program><<<grid_dim, block_dim, 0, nullptr>>>(f, tp, xs...);
}