"docs/source/en/api/logging.md" did not exist on "c3d78cd3067612175ac9f0f8b234abf5a2e1f510"
Commit 99e2985d authored by lishen's avatar lishen
Browse files

warpctc for dcu

parent 0bf5eb5f
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-(x)))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// sIsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. For values that are not a power of 2, the
// result is rounded up when roundUp is true and rounded down otherwise.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding division by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either branch contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the exponent of 2 in the prime factorization of X (the number of
// trailing zero bits).
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
} // namespace mgpu
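
The compile-time helpers above mirror the MGPU_* macros for use in template arguments. Below is a minimal sketch, not part of this commit, that spot-checks a few of them with static_assert; it assumes C++11 and that the header above is on the include path (its file name is an assumption, since it is not shown in this diff).

#include "moderngpu_static.h"  // hypothetical path for the static-math header above

// All checks are evaluated at compile time; the program has no runtime work.
static_assert(mgpu::sIsPow2<64>::value && !mgpu::sIsPow2<48>::value,
    "power-of-2 detection");
static_assert(mgpu::sDivUp<7, 3>::value == 3, "ceil(7 / 3) == 3");
static_assert(mgpu::sRoundUp<10, 4>::value == 12, "10 rounded up to a multiple of 4");
static_assert(mgpu::sLogPow2<8>::value == 3, "log2(8) is exact");
static_assert(mgpu::sLogPow2<9, true>::value == 4, "non-powers of 2 round up");
static_assert(mgpu::sNumFactorsOf2<24>::value == 3, "24 == 2^3 * 3");
static_assert(mgpu::sBankConflictDivisor<12>::value == 4, "divisor 4 for X = 12 = 2^2 * 3");

int main() { return 0; }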
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward declaration of the stream type so CUDA/HIP headers need not be pulled in.
// The original CUDA declaration is kept for reference; the DCU build aliases the
// HIP stream type to the CUstream name used throughout this interface.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the structure using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
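
A minimal initialization sketch for ctcOptions, not part of the header, following the zero-initialization advice in the comment above; the field values shown are illustrative.

// C++: value-initialization zeroes every member, including any added in a
// future version, which is what keeps the struct forward compatible.
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 0;   // left at 0 for the implementation's default; adjust as needed
options.blank_label = 0;   // index reserved for the CTC blank symbol

// C: the equivalent pattern with memset.
// struct ctcOptions c_options;
// memset(&c_options, 0, sizeof(struct ctcOptions));
// c_options.loc = CTC_CPU;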
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on options. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with minibatch examples
* in the minibatch and alphabet_size symbols in the alphabet, is located at:
* activations[(t * minibatch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as the activations, with identical
* memory ordering.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] minibatch How many examples are in the minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In the same memory space as the activations. Should
* be of the size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size, returns the required workspace
* size. The workspace will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] minibatch How many examples are in the minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes pointer to a scalar that receives the memory
* requirement in bytes. The workspace itself should be allocated in the
* same place, CPU or GPU, that the probabilities are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
#ifdef __cplusplus
}
#endif
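
A minimal CPU-side sketch of the call sequence documented above: zero-initialize the options, size the workspace with get_workspace_size, then call compute_ctc_loss. The shapes, label values, and malloc-based workspace are illustrative assumptions, not part of this commit; it assumes ctc.h (and its torch dependency) is available for inclusion.

#include <cstdlib>
#include <vector>
#include "ctc.h"  // the interface above (documented as \file ctc.h)

int run_cpu_ctc_example() {
    const int T = 50, N = 1, A = 29;                // time steps, minibatch size, alphabet size

    // Element (t, n, p) lives at activations[(t * N + n) * A + p], as documented above.
    std::vector<float> activations(T * N * A, 0.0f);
    std::vector<float> gradients(T * N * A, 0.0f);
    std::vector<int>   flat_labels   = {1, 2, 3};   // all labels in the minibatch, concatenated
    std::vector<int>   label_lengths = {3};         // one entry per example
    std::vector<int>   input_lengths = {T};         // one entry per example
    std::vector<float> costs(N, 0.0f);              // filled with the per-example losses

    ctcOptions options{};                           // zero-init for forward compatibility
    options.loc = CTC_CPU;
    options.blank_label = 0;

    size_t workspace_bytes = 0;
    if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                           A, N, options, &workspace_bytes) != CTC_STATUS_SUCCESS)
        return 1;

    void* workspace = std::malloc(workspace_bytes); // CPU here, same memory space as activations
    ctcStatus_t status = compute_ctc_loss(activations.data(), gradients.data(),
                                          flat_labels.data(), label_lengths.data(),
                                          input_lengths.data(), A, N,
                                          costs.data(), workspace, options);
    std::free(workspace);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}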
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template<typename Arg, typename Res = Arg>
struct maximum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? y : x;
}
};
template<typename Arg, typename Res = Arg>
struct minimum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? x : y;
}
};
template<typename Arg, typename Res = Arg>
struct add {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x + y;
}
};
template<typename Arg, typename Res = Arg>
struct identity {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(x);
}
};
template<typename Arg, typename Res = Arg>
struct negate {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(-x);
}
};
template<typename Arg, typename Res = Arg>
struct exponential {
HOSTDEVICE Res operator()(const Arg &x) const { return std::exp(x); }
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1 &p1, const Arg2 &p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
// HOSTDEVICE
// Res operator()(const Arg1& p1, const Arg2& p2) {
// Res p12_max = maximum<Res>()(p1, p2);
// Res p12_min = minimum<Res>()(p1, p2);
// Res p12_diff = p12_min-p12_max;
// Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
// Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
// : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//
// return result;
// }
//};
} // namespace ctc_helper
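
log_plus above is a two-argument log-sum-exp: it returns log(exp(p1) + exp(p2)) by adding log1p(exp(-|p1 - p2|)) to the larger argument, so the exponential never overflows, and the neg_inf checks make -infinity behave as log(0). A small host-only sketch, assuming the helper header above (and hostdevice.h) is included, comparing it with the naive formula:

#include <cmath>
#include <cstdio>
// assumes the ctc_helper header above is included

int main() {
    ctc_helper::log_plus<float> lp;

    float a = std::log(0.25f);
    float b = std::log(0.50f);

    float stable = lp(a, b);                                  // log-sum-exp via log1p
    float naive  = std::log(std::exp(a) + std::exp(b));       // log(0.25 + 0.50)
    std::printf("stable = %f, naive = %f\n", stable, naive);  // both ~= log(0.75)

    // neg_inf() is the additive identity in log space: log(0 + x) == log(x).
    std::printf("log_plus(-inf, b) = %f, b = %f\n",
                lp(ctc_helper::neg_inf<float>(), b), b);
    return 0;
}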
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
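
A minimal C++ sketch of calling the cpu_ctc binding declared above with ATen tensors. The dtypes and shapes follow the usual warp-ctc PyTorch binding convention (float probs of shape (T, N, A), int32 labels and sizes on the CPU, float costs of length N); since the binding's implementation is collapsed in this diff, treat these expectations as assumptions to verify against the source.

#include <torch/extension.h>
// assumes the cpu_ctc declaration above is visible

void cpu_ctc_example() {
    const int64_t T = 50, N = 1, A = 29;        // time steps, minibatch size, alphabet size

    torch::Tensor probs = torch::randn({T, N, A});                   // activations, float32, CPU
    torch::Tensor grads = torch::zeros({T, N, A});                   // written by the call
    torch::Tensor labels = torch::tensor({1, 2, 3}, torch::kInt32);  // concatenated labels
    torch::Tensor label_sizes = torch::full({1}, 3, torch::kInt32);  // per-example label length
    torch::Tensor sizes = torch::full({1}, static_cast<int>(T), torch::kInt32); // per-example time steps
    torch::Tensor costs = torch::zeros({N});                         // per-example loss, written by the call

    int status = cpu_ctc(probs, grads, labels, label_sizes, sizes,
                         /*minibatch_size=*/static_cast<int>(N), costs,
                         /*blank_label=*/0);
    (void)status;
}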