// !!! This file was automatically generated by hipify !!!
// Includes, system
#include <stdio.h>
#include <stdlib.h>

// Includes, HIP
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"


// A wavefront on AMD GCN/CDNA GPUs is 64 lanes wide (vs. 32 on NVIDIA).
const int warp_size = 64;
// Threads per block; the name predates the hipify conversion.
const int kCUDABlockNumThreads = 256;

template<int NT, typename T, typename Rop>
struct CTAReduce {
    enum {
        Size = NT, Capacity = NT
    };
    struct Storage {
        T shared[Capacity];
    };

    // Block-wide reduction: folds the `count` leading per-thread partials
    // with g and returns the combined value in thread 0.
    __device__ static T reduce(int tid, T x, Storage &storage, int count, Rop g) {
        T *s = storage.shared;
        s[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
#pragma unroll
        for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
            if (tid + offset < count && tid < offset) {
                x = g(x, s[offset + tid]);
                s[tid] = x;
            }
            __syncthreads();
        }

        // Finish within the first wavefront using shuffles. HIP's
        // __shfl_down(var, delta) takes no lane mask; it operates across the
        // full 64-lane wavefront.
        T shuff;
        for (int offset = warp_size / 2; offset > 0; offset /= 2) {
            shuff = __shfl_down(x, offset);
            if (tid + offset < count && tid < offset) {
                x = g(x, shuff);
            }
        }
        return x;
    }
};
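
// Example trace (illustrative, not executed): with NT = 256 and
// warp_size = 64, the shared-memory loop folds 256 partials through offsets
// 128 and 64, leaving one partial per lane of the first wavefront; the
// shuffle loop then folds those 64 lanes through offsets 32, 16, 8, 4, 2
// and 1, so thread 0 ends up holding the reduction of all 256 inputs.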

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T *input, T *output,
                            int num_rows, int num_cols) {

    typedef CTAReduce<NT, T, Rop> R;
    __shared__ typename R::Storage storage;

    int tid = threadIdx.x;
    int idx = tid;
    int col = blockIdx.x;

    T curr;
    // Each block reduces one column; thread tid seeds its partial with
    // element tid of that column. (curr stays uninitialized when
    // tid >= num_rows, but CTAReduce::reduce never reads such slots thanks
    // to its count guard.)
    if (idx < num_rows) {
        curr = f(input[idx + col * num_rows]);
    }

    // Stride down the column in steps of NT, folding as we go.
    idx += NT;
    while (idx < num_rows) {
        curr = g(curr, f(input[idx + col * num_rows]));
        idx += NT;
    }

    // Sum thread-totals over the CTA.
    curr = R::reduce(tid, curr, storage, num_rows, g);

    // Store the column's result.
    if (tid == 0) {
        output[col] = curr;
    }
}
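
// Illustrative mapping (not executed): with num_rows = 1000 and NT = 256,
// block 2 owns column 2, and its thread 5 folds rows 5, 261, 517 and 773 of
// that column before the CTA-wide reduction combines the 256 partials.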

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T *input, T *output,
                            int num_rows, int num_cols) {

    __shared__ T s[NT];

    int warps_per_block = NT / warp_size;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = threadIdx.y;
    T curr;

    // Each thread owns one row and strides across the columns in steps of
    // blockDim.y, folding as it goes.
    if (row < num_rows && col < num_cols) {
        curr = f(input[row + col * num_rows]);
        col += blockDim.y;
        while (col < num_cols) {
            curr = g(curr, f(input[row + col * num_rows]));
            col += blockDim.y;
        }
    }
    // Publish this thread's partial. Threads that failed the guard store an
    // uninitialized curr, but the guarded loop below never reads those slots.
    s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
    __syncthreads();

    // Thread (threadIdx.x, 0) folds the blockDim.y partials for its row.
    if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
        for (int i = 1; i < warps_per_block && i < num_cols; ++i)
            curr = g(curr, s[i + threadIdx.x * warps_per_block]);
        output[row] = curr;
    }
}
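
// Illustrative layout (not executed): with NT = 256 the launch below uses a
// (64, 4) thread block, so each block covers 64 consecutive rows, and a
// row's four column-stripe partials land in adjacent slots of s[], where
// its threadIdx.y == 0 thread folds them.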

struct ReduceHelper {

    template<typename T, typename Iof, typename Rof>
    static void impl(Iof f, Rof g, const T *input, T *output, int num_rows, int num_cols, bool axis, hipStream_t stream) {

        int grid_size;
        if (axis) {
            // One block per column.
            grid_size = num_cols;
            hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_rows<kCUDABlockNumThreads>),
                               dim3(grid_size), dim3(kCUDABlockNumThreads), 0, stream,
                               f, g, input, output, num_rows, num_cols);
        } else {
            // warp_size rows per block; the grid must cover all rows, since
            // reduce_cols writes one output per row.
            dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
            grid_size = (num_rows + warp_size - 1) / warp_size;
            hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_cols<kCUDABlockNumThreads>),
                               dim3(grid_size), dim3(tpb), 0, stream,
                               f, g, input, output, num_rows, num_cols);
        }
    }
};
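
// Call shapes (illustrative): for a column-major 5x3 input,
// impl(..., 5, 3, true, stream) writes 3 per-column results, while
// impl(..., 5, 3, false, stream) writes 5 per-row results.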


template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T *input, T *output, int rows, int cols, bool axis, hipStream_t stream) {
    ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
    // Synchronize so asynchronous kernel failures surface here instead of in
    // a later, unrelated runtime call.
    if (hipStreamSynchronize(stream) != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
    return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
    return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
    return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(), input, output, rows, cols, axis, stream);
}
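
// A minimal usage sketch (illustrative only, not part of the library; the
// CTC_REDUCE_EXAMPLE guard is hypothetical and left undefined in normal
// builds). It exercises reduce_max on a small column-major matrix.
#ifdef CTC_REDUCE_EXAMPLE
#include <vector>

int main() {
    const int rows = 5, cols = 3;
    // Column-major input: element (r, c) lives at index r + c * rows.
    std::vector<float> h_in(rows * cols);
    for (int c = 0; c < cols; ++c)
        for (int r = 0; r < rows; ++r)
            h_in[r + c * rows] = static_cast<float>(r + 10 * c);

    float *d_in = nullptr, *d_out = nullptr;
    hipMalloc(&d_in, rows * cols * sizeof(float));
    hipMalloc(&d_out, cols * sizeof(float));
    hipMemcpy(d_in, h_in.data(), rows * cols * sizeof(float), hipMemcpyHostToDevice);

    // axis == true: one maximum per column; for the matrix built above the
    // expected output is {4, 14, 24}.
    ctcStatus_t status = reduce_max(d_in, d_out, rows, cols, true, 0);

    std::vector<float> h_out(cols);
    hipMemcpy(h_out.data(), d_out, cols * sizeof(float), hipMemcpyDeviceToHost);
    for (int c = 0; c < cols; ++c)
        printf("max of column %d = %f\n", c, h_out[c]);

    hipFree(d_in);
    hipFree(d_out);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}
#endif // CTC_REDUCE_EXAMPLE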