conv.cu 5.27 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#include <iostream>
Chao Liu's avatar
Chao Liu committed
2
3
#include <numeric>
#include <initializer_list>
Chao Liu's avatar
Chao Liu committed
4
5
6
7
8
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "device_tensor.cuh"
#include "direct_convolution.cuh"

Chao Liu's avatar
Chao Liu committed
9
template <class T>
Chao Liu's avatar
Chao Liu committed
10
struct GeneratorConstant
Chao Liu's avatar
Chao Liu committed
11
12
13
14
15
16
17
{
    T value = 0;

    template <class... Is>
    T operator()(Is... is)
    {
        return value;
Chao Liu's avatar
Chao Liu committed
18
19
20
21
22
23
24
25
26
27
    }
};

template <class T>
struct GeneratorTensor
{
    template <class... Is>
    T operator()(Is... is)
    {
#if 0
Chao Liu's avatar
Chao Liu committed
28
29
        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
Chao Liu's avatar
Chao Liu committed
30
31
32
33
34
35
36
#else
        assert(sizeof...(Is) > 0);
        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
        std::vector<std::size_t> lens(sizeof...(Is), 100);
        std::vector<std::size_t> strides(sizeof...(Is), 1);
        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);
        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
Chao Liu's avatar
Chao Liu committed
37
38
39
40
#endif
    }
};

Chao Liu's avatar
Chao Liu committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
template <typename T>
void host_convolution(const Tensor<T>& in,
                      const Tensor<T>& wei,
                      Tensor<T>& out,
                      std::size_t num_thread)
{
    auto f = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho + y;
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo + x;
                    v += in(n, c, hi, wi) * wei(k, c, y, x);
                }
            }
        }
        out(n, k, ho, wo) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            out.mDesc.GetLengths()[0],
                                            out.mDesc.GetLengths()[1],
                                            out.mDesc.GetLengths()[2],
                                            out.mDesc.GetLengths()[3]);

    f_par(num_thread);
}

template <class T>
Chao Liu's avatar
Chao Liu committed
74
void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
Chao Liu's avatar
Chao Liu committed
75
76

{
Chao Liu's avatar
Chao Liu committed
77
78
79
80
    DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
    DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
    DeviceTensorDescriptor<4> out_desc_device(out.mDesc);

Chao Liu's avatar
Chao Liu committed
81
    printf("__func__: in_desc_device: {%u %u %u %u}, {%u %u %u %u}\n",
Chao Liu's avatar
Chao Liu committed
82
83
84
           in_desc_device.GetLength(0),
           in_desc_device.GetLength(1),
           in_desc_device.GetLength(2),
Chao Liu's avatar
Chao Liu committed
85
86
87
88
89
           in_desc_device.GetLength(3),
           in_desc_device.GetStride(0),
           in_desc_device.GetStride(1),
           in_desc_device.GetStride(2),
           in_desc_device.GetStride(3));
Chao Liu's avatar
Chao Liu committed
90
91
92
93
94
95

    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

Chao Liu's avatar
Chao Liu committed
96
97
    int num_thread = std::thread::hardware_concurrency();

Chao Liu's avatar
Chao Liu committed
98
    out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
Chao Liu's avatar
Chao Liu committed
99

Chao Liu's avatar
Chao Liu committed
100
101
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
Chao Liu's avatar
Chao Liu committed
102
    out_device_buf.ToDevice(out.mData.data());
Chao Liu's avatar
Chao Liu committed
103

Chao Liu's avatar
Chao Liu committed
104
    dim3 block_dim(64, 1, 1);
Chao Liu's avatar
Chao Liu committed
105
    dim3 grid_dim(1, 1, 1);
Chao Liu's avatar
Chao Liu committed
106
    gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
Chao Liu's avatar
Chao Liu committed
107
108
109
110
111
112
113
        <<<grid_dim, block_dim>>>(in_desc_device,
                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                  wei_desc_device,
                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                  out_desc_device,
                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));

Chao Liu's avatar
Chao Liu committed
114
    checkCudaErrors(cudaGetLastError());
Chao Liu's avatar
Chao Liu committed
115
116
117
118
119
120
    out_device_buf.FromDevice(out.mData.data());
}

int main()
{
#if 0
Chao Liu's avatar
Chao Liu committed
121
    Tensor<float> in({3, 16, 130, 130});
Chao Liu's avatar
Chao Liu committed
122
    Tensor<float> wei({4, 16, 3, 3});
Chao Liu's avatar
Chao Liu committed
123
124
125
126
127
128
    Tensor<float> out_host({3, 4, 128, 128});
#elif 0
    Tensor<float> in({1, 1, 130, 130});
    Tensor<float> wei({1, 1, 3, 3});
    Tensor<float> out_host({1, 1, 128, 128});
#elif 1
Chao Liu's avatar
Chao Liu committed
129
    Tensor<float> in({1, 1, 18, 18});
Chao Liu's avatar
Chao Liu committed
130
    Tensor<float> wei({1, 1, 3, 3});
Chao Liu's avatar
Chao Liu committed
131
    Tensor<float> out_host({1, 1, 16, 16});
Chao Liu's avatar
Chao Liu committed
132
133
134
135
136
137
138
139
140
141
142
#else
    Tensor<float> in({1, 1, 4, 4});
    Tensor<float> wei({1, 1, 3, 3});
    Tensor<float> out_host({1, 1, 2, 2});
#endif
    Tensor<float> out_device = out_host;

    int num_thread = std::thread::hardware_concurrency();

    std::cout << __func__ << ": num_thread " << num_thread << std::endl;

Chao Liu's avatar
Chao Liu committed
143
144
    in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
    wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
Chao Liu's avatar
Chao Liu committed
145

Chao Liu's avatar
Chao Liu committed
146
    host_convolution(in, wei, out_host, num_thread);
Chao Liu's avatar
Chao Liu committed
147
148
149
150
    device_convolution(in, wei, out_device);

    std::cout << __func__ << ": done" << std::endl;

Chao Liu's avatar
Chao Liu committed
151
152
153
    LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl;
    LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
    LogRange(std::cout, out_host.mData, ",") << std::endl;
Chao Liu's avatar
Chao Liu committed
154
    LogRange(std::cout, out_device.mData, ",") << std::endl;
Chao Liu's avatar
Chao Liu committed
155
156
157
158
159
160
161

    float error = 0;
    for(int i = 0; i < out_host.mData.size(); ++i)
    {
        error += std::abs(out_host.mData[i] - out_device.mData[i]);
    }
    std::cout << "error: " << error << std::endl;
Chao Liu's avatar
Chao Liu committed
162
}