Commit 99e2985d authored by lishen

warpctc for dcu

parent 0bf5eb5f
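// Hipified reduction kernels from warp-ctc: per-row / per-column reductions
// (negated sum, exp-sum, max) over a column-major matrix, targeting DCU/HIP.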
// !!! This is a file automatically generated by hipify!!!
// Includes, system
#include <stdio.h>
#include <stdlib.h>
// Includes, HIP runtime (hipified from the original CUDA includes)
#include <hip/hip_runtime.h>
//#include <rocblas.h>
#include <hip/hip_runtime_api.h>
// The CUDA helper headers are not needed after hipification
// #include <helper_cuda.h>
// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"
// On AMD DCU/GCN hardware a wavefront is 64 lanes wide (vs. 32 on NVIDIA).
const int warp_size = 64;
const int kCUDABlockNumThreads = 256;
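// CTAReduce: cooperative block-wide reduction. Partials are folded through
// shared memory until one wavefront remains, then finished with shuffles.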
template<int NT, typename T, typename Rop>
struct CTAReduce;

template<int NT, typename T, typename Rop>
struct CTAReduce {
    enum { Size = NT, Capacity = NT };

    struct Storage {
        T shared[Capacity];
    };

    __device__ static T reduce(int tid, T x, Storage &storage, int count, Rop g) {
        T *s = storage.shared;
        s[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
#pragma unroll
        for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
            if (tid + offset < count && tid < offset) {
                x = g(x, s[offset + tid]);
                s[tid] = x;
            }
            __syncthreads();
        }

        // Finish within a single wavefront using register shuffles.
        T shuff;
        for (int offset = warp_size / 2; offset > 0; offset /= 2) {
            // CUDA would use __shfl_down_sync(0xFFFFFFFF, x, offset);
            // HIP's __shfl_down takes no mask argument.
            shuff = __shfl_down(x, offset);
            if (tid + offset < count && tid < offset) {
                x = g(x, shuff);
            }
        }
        return x;
    }
};
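// reduce_rows: one block per column; each thread strides down the column
// applying f, and CTAReduce combines the per-thread partials with g.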
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T *input, T *output,
                            int num_rows, int num_cols) {
    typedef CTAReduce<NT, T, Rop> R;
    __shared__ typename R::Storage storage;

    int tid = threadIdx.x;
    int idx = tid;
    int col = blockIdx.x;
    T curr;

    // Each block works on a column
    if (idx < num_rows) {
        curr = f(input[idx + col * num_rows]);
    }
    idx += NT;

    while (idx < num_rows) {
        curr = g(curr, f(input[idx + col * num_rows]));
        idx += NT;
    }

    // Sum thread-totals over the CTA.
    curr = R::reduce(tid, curr, storage, num_rows, g);

    // Store result in out
    if (tid == 0) {
        output[col] = curr;
    }
}
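// reduce_cols: blockDim.y threads share each row, striding across its
// columns; their partial results are then combined through shared memory.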
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T *input, T *output,
                            int num_rows, int num_cols) {
    __shared__ T s[NT];

    int warps_per_block = NT / warp_size;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = threadIdx.y;
    T curr;

    if (row < num_rows && col < num_cols) {
        curr = f(input[row + col * num_rows]);
        col += blockDim.y;
        while (col < num_cols) {
            curr = g(curr, f(input[row + col * num_rows]));
            col += blockDim.y;
        }
    }
    s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
    __syncthreads();

    // Reduce the per-warp partials for this row.
    if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
        for (int i = 1; i < warps_per_block && i < num_cols; ++i)
            curr = g(curr, s[i + threadIdx.x * warps_per_block]);
        output[row] = curr;
    }
}
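// ReduceHelper dispatches on `axis`: true collapses the rows of each column
// (one output per column), false collapses the columns of each row.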
struct ReduceHelper {
    template<typename T, typename Iof, typename Rof>
    static void impl(Iof f, Rof g, const T *input, T *output, int num_rows,
                     int num_cols, bool axis, hipStream_t stream) {
        int grid_size;
        if (axis) {
            // One block per column.
            grid_size = num_cols;
            hipLaunchKernelGGL((reduce_rows<kCUDABlockNumThreads>),
                               dim3(grid_size), dim3(kCUDABlockNumThreads), 0, stream,
                               f, g, input, output, num_rows, num_cols);
        } else {
            dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
            // reduce_cols writes one output per row, so the grid must cover
            // num_rows. (The original used num_cols here, which under-launches
            // whenever num_rows > num_cols.)
            grid_size = (num_rows + warp_size - 1) / warp_size;
            hipLaunchKernelGGL((reduce_cols<kCUDABlockNumThreads>),
                               dim3(grid_size), dim3(tpb), 0, stream,
                               f, g, input, output, num_rows, num_cols);
        }
    }
};
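// Host-side wrapper: launches the reduction, synchronizes the stream, and
// maps any HIP error to a ctcStatus_t.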
template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T *input, T *output, int rows, int cols,
                   bool axis, hipStream_t stream) {
    ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
    hipStreamSynchronize(stream);
    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;
    return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols,
                          bool axis, hipStream_t stream) {
    return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols,
                       bool axis, hipStream_t stream) {
    return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols,
                       bool axis, hipStream_t stream) {
    return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(),
                  input, output, rows, cols, axis, stream);
}
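// ---------------------------------------------------------------------------
// Minimal usage sketch (illustrative only; not part of the original file).
// `d_activations`, `d_maxes`, `rows`, and `cols` are hypothetical names for a
// column-major device buffer and its per-column outputs.
//
//   float *d_activations, *d_maxes;
//   hipMalloc(&d_activations, rows * cols * sizeof(float));
//   hipMalloc(&d_maxes, cols * sizeof(float));
//   hipStream_t stream;
//   hipStreamCreate(&stream);
//   // axis == true: one max per column
//   ctcStatus_t status = reduce_max(d_activations, d_maxes, rows, cols,
//                                   true, stream);
//   if (status != CTC_STATUS_SUCCESS) { /* launch or execution failed */ }
// ---------------------------------------------------------------------------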