// GPU reduction kernels used by the CTC cost: apply a unary functor to each
// element and fold the results along rows or columns of a column-major matrix.

// Includes, system
#include <stdio.h>
#include <stdlib.h>

// Includes, cuda
#include <hip/hip_runtime.h>
//#include <cublas_v2.h>
#include <hip/hip_runtime_api.h>

// Includes, cuda helper functions
// #include <helper_cuda.h>

// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"

const int warp_size = 64;
const int kCUDABlockNumThreads = 256;

template <int NT, typename T, typename Rop>
struct CTAReduce;

template <int NT, typename T, typename Rop>
struct CTAReduce {
    enum { Size = NT, Capacity = NT };
    struct Storage { T shared[Capacity]; };

    __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
        T* s = storage.shared;
        s[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
#pragma unroll
        for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
            if (tid + offset < count && tid < offset) {
                // Read from the right half and store to the left half.
                x = g(x, s[offset + tid]);
                s[tid] = x;
            }
            __syncthreads();
        }

        // Finish the last warp's worth of work with shuffles (no shared memory).
        T shuff;
        for (int offset = warp_size / 2; offset > 0; offset /= 2) {
            // shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
            shuff = __shfl_down(x, offset);
            if (tid + offset < count && tid < offset)
                x = g(x, shuff);
        }
        return x;
    }
};

template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {

    typedef CTAReduce<NT, T, Rop> R;
    __shared__ typename R::Storage storage;

    int tid = threadIdx.x;
    int idx = tid;
    int col = blockIdx.x;
    T curr;

    // Each block works on a column
    if (idx < num_rows)
        curr = f(input[idx + col * num_rows]);
    // __syncthreads();
    idx += NT;

    while (idx < num_rows) {
        curr = g(curr, f(input[idx + col * num_rows]));
        idx += NT;
    }

    // Sum thread-totals over the CTA.
    curr = R::reduce(tid, curr, storage, num_rows, g);

    // Store result in out
    if (tid == 0)
        output[col] = curr;
}

template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {

    __shared__ T s[NT];

    int warps_per_block = NT / warp_size;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = threadIdx.y;
    T curr;

    // Each thread folds a strided subset of the columns of its row.
    if (row < num_rows && col < num_cols) {
        curr = f(input[row + col * num_rows]);
        col += blockDim.y;
        while (col < num_cols) {
            curr = g(curr, f(input[row + col * num_rows]));
            col += blockDim.y;
        }
    }
    s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
    __syncthreads();

    // Reduce the per-thread partials for each row and write one value per row.
    if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
        for (int i = 1; i < warps_per_block && i < num_cols; ++i)
            curr = g(curr, s[i + threadIdx.x * warps_per_block]);
        output[row] = curr;
    }
}

struct ReduceHelper {

    template <typename T, typename Iof, typename Rof>
    static void impl(Iof f, Rof g, const T* input, T* output, int num_rows,
                     int num_cols, bool axis, CUstream stream) {

        int grid_size;

        if (axis) {
            // One block per column: reduce over the rows within each column.
            grid_size = num_cols;
            reduce_rows<kCUDABlockNumThreads><<<grid_size, kCUDABlockNumThreads, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        } else {
            // warp_size rows per block: reduce over the columns within each row.
            dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
            grid_size = (num_cols + warp_size - 1) / warp_size;
            reduce_cols<kCUDABlockNumThreads><<<grid_size, tpb, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        }
    }
};

template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols,
                   bool axis, CUstream stream) {
    ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
    hipStreamSynchronize(stream);
    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols,
                          bool axis, CUstream stream) {
    return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols,
                       bool axis, CUstream stream) {
    return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols,
                       bool axis, CUstream stream) {
    auto ctc_status = reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(),
                             input, output, rows, cols, axis, stream);
    return ctc_status;
}
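
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the library): host code that
// computes the per-column maximum of a column-major `rows x cols` activation
// matrix, the way the CTC cost uses reduce_max before its softmax. Buffer
// names (`d_acts`, `d_maxes`) and the use of the default stream are
// assumptions for this example; error handling is elided.
//
//   float *d_acts = nullptr, *d_maxes = nullptr;
//   hipMalloc(&d_acts, rows * cols * sizeof(float));
//   hipMalloc(&d_maxes, cols * sizeof(float));
//   // ... fill d_acts with activations ...
//   ctcStatus_t status = reduce_max(d_acts, d_maxes, rows, cols,
//                                   true /* axis: reduce over rows within each column */,
//                                   0 /* default stream */);
//   if (status != CTC_STATUS_SUCCESS) {
//       // handle CTC_STATUS_EXECUTION_FAILED
//   }
// ---------------------------------------------------------------------------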