OpenDAS / Warpctc · Commits

Commit 99e2985d
authored May 16, 2023 by lishen

warpctc for dcu

parent 0bf5eb5f
Changes: 41

Showing 1 changed file with 0 additions and 174 deletions.

src/reduce.hip  (deleted, 100644 → 0)  +0 -174
// !!! This is a file automatically generated by hipify!!!
// Includes, system
#include <stdio.h>
#include <stdlib.h>
// Includes, HIP runtime
#include <hip/hip_runtime.h>
// #include <rocblas.h>
#include <hip/hip_runtime_api.h>
// CUDA helper headers, unused after hipification
// #include <helper_cuda.h>
// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"
const int warp_size = 64;
const int kCUDABlockNumThreads = 256;
template<int NT, typename T, typename Rop>
struct CTAReduce;
template<int NT, typename T, typename Rop>
struct CTAReduce {
enum {
Size = NT, Capacity = NT
};
struct Storage {
T shared[Capacity];
};
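// Reduce `count` per-thread partials to one value (returned by thread 0):
// fold through shared memory down to a single wavefront, then use shuffles.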
__device__ static T reduce(int tid, T x, Storage &storage, int count, Rop g) {
T *s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
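// Finish within one 64-lane wavefront using register shuffles; lanes of a
// wavefront run in lockstep, so no __syncthreads() is needed here.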
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
// Leftover from the CUDA source's __shfl_down_sync, whose mask argument
// HIP's __shfl_down does not take:
// shuff = __shfl_down(0xFFFFFFF, x, offset);
shuff = __shfl_down(x, offset);
if (tid + offset < count && tid < offset) {
x = g(x, shuff);
}
}
return x;
}
};
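// One block per column: each thread strides down the column with stride NT,
// then CTAReduce combines the per-thread partials and thread 0 writes output.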
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) {
curr = f(input[idx + col * num_rows]);
}
// __syncthreads();
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) {
output[col] = curr;
}
}
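// Column-axis reduction: blockDim = (warp_size, NT / warp_size). Each thread
// owns one row and strides across columns with threadIdx.y; the blockDim.y
// partials per row are then combined through shared memory by threadIdx.y == 0.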
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
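// Pick a kernel by axis: true reduces along rows (one result per column),
// false reduces along columns (one result per row).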
struct ReduceHelper {
template<typename T, typename Iof, typename Rof>
static void impl(Iof f, Rof g, const T *input, T *output, int num_rows, int num_cols, bool axis, hipStream_t stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
hipLaunchKernelGGL(( reduce_rows<kCUDABlockNumThreads>), dim3(grid_size), dim3(kCUDABlockNumThreads), 0, stream,
f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
grid_size = (num_rows + warp_size - 1) / warp_size; // one block per warp_size rows
hipLaunchKernelGGL(( reduce_cols<kCUDABlockNumThreads>), dim3(grid_size), dim3(tpb), 0, stream,
f, g, input, output, num_rows, num_cols);
}
}
};
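// Launch the reduction, synchronize the stream, and map any HIP error to a
// warp-ctc status code.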
template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T *input, T *output, int rows, int cols, bool axis, hipStream_t stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
hipStreamSynchronize(stream);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
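// Exported single-precision entry points: negate-and-sum, exp-and-sum, and max
// (the latter two together support a numerically stable softmax denominator).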
ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, hipStream_t stream) {
return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(), input, output, rows, cols, axis, stream);
}
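For reference, a minimal host-side sketch of driving one of these entry points. Buffer names and sizes are illustrative, not from this commit, and the declaration of reduce_max is restated inline rather than assuming a particular project header:

// Usage sketch (illustrative): reduce each column of a rows x cols
// column-major matrix to its maximum with reduce_max.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>
#include "ctc.h"

// Defined in src/reduce.hip above.
ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols,
                       bool axis, hipStream_t stream);

int main() {
    const int rows = 128, cols = 4;
    std::vector<float> h_in(rows * cols, 1.0f);

    float *d_in = nullptr, *d_out = nullptr;
    hipMalloc(&d_in, rows * cols * sizeof(float));
    hipMalloc(&d_out, cols * sizeof(float)); // axis = true: one result per column
    hipMemcpy(d_in, h_in.data(), rows * cols * sizeof(float), hipMemcpyHostToDevice);

    // Default stream; axis = true selects the per-column (reduce_rows) path.
    // reduce() synchronizes internally, so d_out is ready on return.
    ctcStatus_t st = reduce_max(d_in, d_out, rows, cols, true, 0);
    if (st != CTC_STATUS_SUCCESS)
        fprintf(stderr, "reduce_max failed\n");

    hipFree(d_in);
    hipFree(d_out);
    return 0;
}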