// hip_contiguous.cpp

#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <type_traits>

#include <hip/hip_runtime.h>
#include <migraph/operators.hpp>

namespace migraph {
namespace gpu {

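// Per-thread index information passed to device lambdas: the global thread
// id, the id within the workgroup, and the workgroup id.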
struct index
{
    std::size_t global;
    std::size_t local;
    std::size_t group;
};

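// Generic kernel entry point: computes this thread's index and forwards it
// to the callable f.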
template <class F>
__global__ void launcher(F f)
{
    index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x};
    f(idx);
}

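// Returns a function object that launches `launcher` over global / local
// workgroups of local threads each, on the default (null) stream.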
auto launch(std::size_t global, std::size_t local)
{
    // Capture by value: the returned lambda outlives this function's
    // parameters, so capturing them by reference would leave dangling
    // references by the time the lambda is invoked.
    return [=](auto f) {
        assert(local > 0);
        assert(global > 0);
        using f_type = decltype(f);
        dim3 nblocks(global / local);
        dim3 nthreads(local);
        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
    };
}

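// Dispatch a runtime dimension count to a compile-time constant so that
// rank-templated types can be instantiated. Tensors of rank 1 through 5 are
// supported.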
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1:
    {
        f(std::integral_constant<std::size_t, 1>{});
        break;
    }
    case 2:
    {
        f(std::integral_constant<std::size_t, 2>{});
        break;
    }
    case 3:
    {
        f(std::integral_constant<std::size_t, 3>{});
        break;
    }
    case 4:
    {
        f(std::integral_constant<std::size_t, 4>{});
        break;
    }
    case 5:
    {
        f(std::integral_constant<std::size_t, 5>{});
        break;
    }
    default: throw std::runtime_error("Unknown tensor size");
    }
}

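// A fixed-size multi-dimensional index usable from both host and device code.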
template <size_t NDim>
struct hip_index
{
    size_t d[NDim];
    __device__ __host__ size_t& operator[](size_t i) { return d[i]; }
    __device__ __host__ size_t operator[](size_t i) const { return d[i]; }
};

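// Lengths and strides of an NDim tensor. Trivially copyable, so it can be
// passed by value to device code.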
template <size_t NDim>
struct hip_tensor_descriptor
{
    __device__ __host__ hip_tensor_descriptor() = default;
    template <typename T, typename V>
    __device__ __host__ hip_tensor_descriptor(const T& lens_ext, const V& strides_ext)
    {
        for(size_t i = 0; i < NDim; i++)
            lens[i] = lens_ext[i];
        for(size_t i = 0; i < NDim; i++)
            strides[i] = strides_ext[i];
    }
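    // Convert a linear offset into a multi-dimensional index. This is only
    // valid when the strides are standard (packed, row-major), as they are
    // for the contiguous output shape this is used with.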
    __device__ __host__ hip_index<NDim> multi(size_t idx) const
    {
        hip_index<NDim> result{};
        size_t tidx = idx;
        for(size_t is = 0; is < NDim; is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
        return result;
    }
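    // Convert a multi-dimensional index into a linear offset using this
    // descriptor's strides.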
    __device__ __host__ size_t linear(hip_index<NDim> s) const
    {
        size_t idx = 0;
        for(size_t i = 0; i < NDim; i++)
            idx += s[i] * strides[i];
        return idx;
    }
    size_t lens[NDim]    = {};
    size_t strides[NDim] = {};
};

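// An earlier standalone-kernel version of the copy, kept commented out for
// reference; hip_contiguous below expresses the same grid-stride loop through
// the generic launch helper instead.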
// template <typename T, size_t NDim>
// __global__ void contiguous_gpu(const T* a,
//                                hip_tensor_descriptor<NDim> a_desc,
//                                T* at,
//                                hip_tensor_descriptor<NDim> at_desc,
//                                size_t nelements)
// {
//     for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nelements;
//         i += blockDim.x * gridDim.x)
//     {
//         hip_index<NDim> s = at_desc.multi(i);
//         size_t lidx       = a_desc.linear(s);
//         at[i]             = a[lidx];
//     }
// }

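// Copy arg into result so that the data is laid out contiguously according to
// output_shape: each output element is read from the corresponding, possibly
// strided, input location.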
void hip_contiguous(migraph::shape output_shape, migraph::argument arg, migraph::argument result)
{
    visit_all(result, arg)([&](auto output, auto input) {
        visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
            const auto& s = arg.get_shape();
            hip_tensor_descriptor<ndim> a_desc(s.lens(), s.strides());
            hip_tensor_descriptor<ndim> at_desc(output_shape.lens(), output_shape.strides());
            auto* a             = input.data();
            auto* at            = output.data();
            auto nelements      = s.elements();
            std::size_t nlocal  = 512;
            std::size_t nglobal = 512 * nlocal;

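            // Fixed-size launch; the grid-stride loop below covers tensors
            // with more than nglobal elements.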
            launch(nglobal, nlocal)([=](auto idx) mutable {
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {
                    size_t lidx = a_desc.linear(at_desc.multi(i));
                    at[i]       = a[lidx];
                }
            });
        });
    });
}
} // namespace gpu
} // namespace migraph