tensor.hpp 5.91 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <numeric>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

// Runtime tag naming the element type stored in a tensor.
// Kept as an unscoped enum so both `Float` and `DataType_t::Float` remain valid spellings.
enum DataType_t
{
    Half  = 0,
    Float = 1,
};

template <class T>
struct DataType;

template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};

// Describes the layout of a tensor: per-dimension lengths, per-dimension
// strides and the element data type. Several members are only declared here;
// their definitions are not visible in this header.
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens);
    TensorDescriptor(DataType_t t,
                     std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(DataType_t t, std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    // Declared only; called by the lengths-only constructor below, so it is
    // presumably responsible for filling mStrides from mLens -- confirm
    // against the out-of-line definition.
    void CalculateStrides();

    // Builds a descriptor from any range of lengths (anything exposing
    // begin()/end()); strides are then derived through CalculateStrides().
    template <class Range>
    TensorDescriptor(DataType_t t, const Range& lens)
        : mLens(lens.begin(), lens.end()), mDataType(t)
    {
        this->CalculateStrides();
    }

    // Builds a descriptor from explicit ranges of lengths and strides.
    // The two ranges are copied as given; no consistency check is performed.
    template <class Range1, class Range2>
    TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t)
    {
    }

    std::size_t GetDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;

    // Read-only views of the stored lengths / strides.
    const std::vector<std::size_t>& GetLengths() const { return mLens; }
    const std::vector<std::size_t>& GetStrides() const { return mStrides; }

    // Returns the linear offset of the element at multi-index (xs...), i.e.
    // the sum over dimensions of index * stride.
    // One index per dimension is expected (asserted in debug builds).
    template <class... Xs>
    std::size_t GetIndex(Xs... xs) const
    {
        assert(sizeof...(Xs) == this->GetDimension());
        // inner_product reads one stride per supplied index.
        assert(sizeof...(Xs) <= mStrides.size());
        // Cast explicitly: brace-initialising size_t from e.g. int is a
        // narrowing conversion and would be ill-formed.
        std::initializer_list<std::size_t> is{static_cast<std::size_t>(xs)...};
        return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    // Tensor<T> reads the private lengths directly when generating values.
    template <class T>
    friend struct Tensor;

    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;

    DataType_t mDataType;
};

// Host-side tensor: a TensorDescriptor plus a flat std::vector<T> whose size
// is taken from mDesc.GetElementSpace().
template <class T>
struct Tensor
{
    // Construct from a braced list of lengths; strides are left to the descriptor.
    template <class X>
    Tensor(std::initializer_list<X> lens)
        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    // Construct from a vector of lengths; strides are left to the descriptor.
    template <class X>
    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    // Construct from explicit lengths and strides.
    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    // Fills the tensor by evaluating g(indices...) for each multi-index and
    // storing the result at the offset the descriptor reports for that index.
    //
    // NOTE(review): `parallel_for` is not declared anywhere in the visible
    // source, and `mLens` is a private member of TensorDescriptor as declared
    // in this file -- both must be resolved (a declaration for parallel_for,
    // and friend access or a public accessor for the lengths) before this
    // member can be instantiated.
    template <class G>
    void GenerateTensorValue(G g)
    {
        // Generic lambda: the number of indices is decided by the caller of
        // the lambda, so the parameter pack is deduced rather than named.
        parallel_for([&](auto... xs) { mData[mDesc.GetIndex(xs...)] = g(xs...); }, mDesc.mLens);
    }

    // Bounds-checked element access into the flat storage (throws std::out_of_range).
    T& operator[](std::size_t i) { return mData.at(i); }

    const T& operator[](std::size_t i) const { return mData.at(i); }

    // Iteration over the flat storage, in storage order.
    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};

struct GpuMem
{
    GpuMem() = delete;
    GpuMem(std::size_t sz, std::size_t data_sz) : mSz(sz), mDataSz(data_sz)
    {
        cudaMalloc(statci_cast<void**>(&GpuBuf), mDataSize * mSz);
    }

    int ToGpu(void* p)
    {
        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSz * mSz, cudaMemCpyHostToDevice));
    }

    int FromGpu(void* p) { return static_cast<int>(cuadMemCpy(p, mGpuBuf, mDataSz * mSz)); }

    ~GpuMem() { cudaFree(mGpuBuf); }

    void* mGpuBuf;
    std::size_t mSz;
    std::size_t mDataSz;
};

// Scratch usage sketch: wraps two 4-argument lambdas in parallel tensor
// functors and invokes them.
//
// NOTE(review): this function does not match the rest of the visible file and
// looks like a draft of the intended API -- confirm intent before relying on it:
//  - do_f1 / do_f2 are not declared in the visible source.
//  - generate_ParallelTensorFunctor is not declared in the visible source; the
//    factory defined further down is spelled generate_parallel_tensor_functor
//    (presumably the intended callee) and is declared after this function.
//  - f1 takes four ints but five extents are passed; f2 takes four ints but
//    three extents are passed.
//  - par_f1() / par_f2() are called without arguments and their results are
//    stored, whereas the operator() overloads visible in ParallelTensorFunctor
//    each take a tag argument and return void.
//  - As a non-inline, non-template function defined in a header, it would be
//    defined once per including translation unit.
void dummy()
{
    // Per-element callbacks over a 4-D index (n, c, h, w).
    auto f1 = [](int n, int c, int h, int w) { do_f1(n, c, h, w); };
    auto f2 = [](int n, int c, int h, int w) { do_f2(n, c, h, w); };

    // Bind each callback to an index-space extent list.
    auto par_f1 = generate_ParallelTensorFunctor(f1, 3, 3, 3, 3, 3);
    auto par_f2 = generate_ParallelTensorFunctor(f2, 4, 4, 4);

    // Run the functors.
    auto r1 = par_f1();
    auto r2 = par_f2();
}

// Forward declaration so the factory below can name the class template, which
// is defined later in this file.
template <class F, class... Xs>
struct ParallelTensorFunctor;

// Factory that deduces the template arguments of ParallelTensorFunctor.
//   f      callable to be invoked with one index per dimension
//   xs...  extent of each dimension, outermost first
// Returns a ParallelTensorFunctor<F, Xs...> constructed from (f, xs...).
// The template arguments are spelled out explicitly, so this does not rely on
// C++17 class template argument deduction.
template <class F, class... Xs>
auto generate_parallel_tensor_functor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}

// Invokes a callable once for every multi-index of an N-dimensional index
// space, either on the calling thread or split across worker threads.
//
//   F      callable taking NDIM index arguments (each passed as std::size_t)
//   Xs...  types of the per-dimension extents given to the constructor
//
// The index space is flattened row-major: the last dimension has stride 1.
// In Parallel mode mF is invoked concurrently from several threads, so it
// must be safe to call that way. An exception escaping mF on a worker thread
// terminates the program (std::thread semantics).
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    static_assert(sizeof...(Xs) > 0, "ParallelTensorFunctor needs at least one dimension");

    enum ParallelMethod_t
    {
        Serial   = 0,
        Parallel = 1,
    };

    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;    // extent of each dimension
    std::array<std::size_t, NDIM> mStrides; // flat-index stride of each dimension
    std::size_t mN1d;                       // total number of index tuples

    ParallelTensorFunctor(F f, Xs... xs)
        : mF(std::move(f)), mLens{{static_cast<std::size_t>(xs)...}}
    {
        // Suffix products of the extents: stride[i] = len[i+1] * ... * len[N-1].
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Converts a flat index in [0, mN1d) to its per-dimension indices.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;
        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }
        return indices;
    }

    // Serial mode: visits every index tuple in flat order on the calling thread.
    void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>)
    {
        RunRange(0, mN1d);
    }

    // Parallel mode: splits the flat index range into contiguous chunks, one
    // per worker thread, and returns once all workers have finished.
    // num_thread == 0 is treated as 1. Workers that would receive an empty
    // chunk are not started.
    void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Parallel>,
                    std::size_t num_thread)
    {
        if(num_thread == 0)
            num_thread = 1;

        const std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        // Joins every started worker on scope exit, including when starting a
        // later worker throws.
        struct JoinAll
        {
            std::vector<std::thread>& workers;
            ~JoinAll()
            {
                for(std::thread& worker : workers)
                    if(worker.joinable())
                        worker.join();
            }
        };

        std::vector<std::thread> threads;
        threads.reserve(std::min(num_thread, mN1d));
        JoinAll join_guard{threads};

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            const std::size_t iw_begin = it * work_per_thread;
            if(iw_begin >= mN1d)
                break;
            const std::size_t iw_end = std::min(iw_begin + work_per_thread, mN1d);

            // `this` outlives the workers: they are joined before returning.
            threads.emplace_back([this, iw_begin, iw_end] { RunRange(iw_begin, iw_end); });
        }
    }

    private:
    // Invokes mF for every flat index in [iw_begin, iw_end).
    void RunRange(std::size_t iw_begin, std::size_t iw_end)
    {
        for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
            InvokeAt(GetNdIndices(iw), std::make_index_sequence<NDIM>{});
    }

    // Expands the index array into NDIM separate arguments of mF.
    template <std::size_t... Is>
    void InvokeAt(const std::array<std::size_t, NDIM>& indices, std::index_sequence<Is...>)
    {
        mF(indices[Is]...);
    }
};

// std::thread that joins in its destructor instead of terminating the program
// when destroyed while still joinable.
//
// Move-only. Move-assigning onto an object that is still joinable terminates
// the program, exactly as for std::thread. The destructor is not virtual, so
// destroying through a std::thread pointer or reference would skip the join.
struct joinable_thread : std::thread
{
    // Forwards all arguments to std::thread; with no arguments this yields a
    // thread object that does not represent a thread of execution.
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    // Declared explicitly because the user-provided destructor suppresses the
    // implicit move operations; without them the type cannot be assigned to
    // or relocated inside a container.
    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

// Implementation detail of call_f_unpack_indices: calls f with the elements
// of `indices` selected by the compile-time positions Is... as separate
// arguments, and returns whatever f returns.
// Defined before its caller so ordinary lookup finds it for every F and T.
template <class F, class T, std::size_t... Is>
auto call_f_unpack_indices_impl(F f, T indices, std::integer_sequence<std::size_t, Is...>)
{
    return f(std::get<Is>(indices)...);
}

// Calls f(indices[0], indices[1], ..., indices[N-1]) and returns its result.
//   f        callable accepting N arguments
//   indices  tuple-like object (std::array, std::tuple, std::pair); N is
//            std::tuple_size<T>::value
template <class F, class T>
auto call_f_unpack_indices(F f, T indices)
{
    constexpr std::size_t N = std::tuple_size<T>::value;
    using NSeq              = std::make_integer_sequence<std::size_t, N>;

    // f and indices are already local copies; hand them on without copying again.
    return call_f_unpack_indices_impl(std::move(f), std::move(indices), NSeq{});
}