OpenDAS / Warpctc · Commits

Commit 99e2985d, authored May 16, 2023 by lishen
warpctc for dcu
parent 0bf5eb5f
Changes: 41
Showing 20 changed files with 0 additions and 3166 deletions (+0 −3166):
include/contrib/moderngpu/include/mgpuenums.h      +0 −70
include/contrib/moderngpu/include/util/static.h    +0 −183
include/ctc.h                                      +0 −134
include/detail/cpu_ctc.h                           +0 −521
include/detail/ctc_helper.h                        +0 −95
include/detail/gpu_ctc.h                           +0 −479
include/detail/gpu_ctc_kernels.h                   +0 −496
include/detail/hostdevice.h                        +0 −7
include/detail/reduce.h                            +0 −5
pytorch_binding/setup.cfg                          +0 −1
pytorch_binding/setup.py                           +0 −356
pytorch_binding/src/binding.cu                     +0 −112
pytorch_binding/src/binding.hip                    +0 −113
pytorch_binding/src/cpu_binding.h                  +0 −20
pytorch_binding/src/gpu_binding.h                  +0 −21
pytorch_binding/tests/test_gpu.py                  +0 −37
pytorch_binding/tests/test_gpu_speed.py            +0 −75
pytorch_binding/warpctc_pytorch/__init__.py        +0 −89
src/ctc_entrypoint.cu                              +0 −179
src/reduce.cu                                      +0 −173
include/contrib/moderngpu/include/mgpuenums.h (deleted, 100644 → 0)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

namespace mgpu {

enum MgpuBounds {
    MgpuBoundsLower,
    MgpuBoundsUpper
};

enum MgpuScanType {
    MgpuScanTypeExc,
    MgpuScanTypeInc
};

enum MgpuSearchType {
    MgpuSearchTypeNone,
    MgpuSearchTypeIndex,
    MgpuSearchTypeMatch,
    MgpuSearchTypeIndexMatch
};

enum MgpuJoinKind {
    MgpuJoinKindInner,
    MgpuJoinKindLeft,
    MgpuJoinKindRight,
    MgpuJoinKindOuter
};

enum MgpuSetOp {
    MgpuSetOpIntersection,
    MgpuSetOpUnion,
    MgpuSetOpDiff,
    MgpuSetOpSymDiff
};

} // namespace mgpu
include/contrib/moderngpu/include/util/static.h (deleted, 100644 → 0)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {

typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;

// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
    enum { value = 0 == (X & (X - 1)) };
};

// Finds the base-2 logarithm of X. value is -1 if X is not a power of 2.
template<int X, bool roundUp = true> struct sLogPow2 {
    enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
    enum { inner = sLogPow2<X / 2>::inner + 1 };
    enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
    enum { inner = 0 };
    enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
    enum { inner = 0 };
    enum { value = 0 };
};

template<int X, int Y> struct sDivUp {
    enum { value = (X + Y - 1) / Y };
};

template<int count, int levels> struct sDiv2RoundUp {
    enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
    enum { value = count };
};

template<int X, int Y> struct sDivSafe {
    enum { value = X / Y };
};
template<int X> struct sDivSafe<X, 0> {
    enum { value = 0 };
};

template<int X, int Y> struct sRoundUp {
    enum { rem = X % Y };
    enum { value = X + (rem ? (Y - rem) : 0) };
};

template<int X, int Y> struct sRoundDown {
    enum { rem = X % Y };
    enum { value = X - rem };
};

// IntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y> struct sIntegerDiv {
    enum { value = X / (Y ? Y : (X + 1)) };
};

template<int X, int Y> struct sMax {
    enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y> struct sMin {
    enum { value = (X <= Y) ? X : Y };
};
template<int X> struct sAbs {
    enum { value = (X >= 0) ? X : -X };
};

// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
    enum { shifted = X >> 1 };
    enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
    enum { value = 0 };
};

// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
    enum { value = (1 & X) ? 0 :
        (sIsPow2<X>::value ? NumBanks : (1 << sNumFactorsOf2<X>::value)) };
    enum { log_value = sLogPow2<value>::value };
};

template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
    enum { count = NT * X };
    enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
    enum { padding = sDivSafe<count, divisor>::value };
    enum { value = count + padding };
};

} // namespace mgpu
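These helpers are pure compile-time arithmetic, so their behavior can be spot-checked with static_assert. A few checks (my addition, not part of the deleted file), assuming a C++11 compiler and that the header above is included:

// Spot checks of the compile-time helpers above.
static_assert(mgpu::sIsPow2<64>::value, "64 is a power of 2");
static_assert(mgpu::sLogPow2<64>::value == 6, "log2(64) == 6");
static_assert(mgpu::sLogPow2<65, true>::value == 7, "non-powers of 2 round up by default");
static_assert(mgpu::sDivUp<10, 4>::value == 3, "ceil(10 / 4) == 3");
static_assert(mgpu::sDivSafe<10, 0>::value == 0, "division by zero is guarded, not an error");
static_assert(mgpu::sNumFactorsOf2<24>::value == 3, "24 == 2^3 * 3");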
include/ctc.h (deleted, 100644 → 0)
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
//forward declare of CUDA typedef to avoid needing to pull in CUDA headers
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;

typedef enum {
    CTC_STATUS_SUCCESS = 0,
    CTC_STATUS_MEMOPS_FAILED = 1,
    CTC_STATUS_INVALID_VALUE = 2,
    CTC_STATUS_EXECUTION_FAILED = 3,
    CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;

/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();

/** Returns a string containing a description of status that was passed in
 *  \param[in] status identifies which string should be returned
 *  \return C style string containing the text description
 *  */
const char* ctcGetStatusString(ctcStatus_t status);

typedef enum {
    CTC_CPU = 0,
    CTC_GPU = 1
} ctcComputeLocation;

/** Structure used for options to the CTC computation. Applications
 *  should zero out the array using memset and sizeof(struct
 *  ctcOptions) in C or default initialization (e.g. 'ctcOptions
 *  options{};' or 'auto options = ctcOptions{}') in C++ to ensure
 *  forward compatibility with added options. */
struct ctcOptions {
    /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
    ctcComputeLocation loc;
    union {
        /// used when loc == CTC_CPU, the maximum number of threads that can be used
        unsigned int num_threads;
        /// used when loc == CTC_GPU, which stream the kernels should be launched in
        CUstream stream;
    };
    /// the label value/index that the CTC calculation should use as the blank label
    int blank_label;
};

/** Compute the connectionist temporal classification loss between a sequence
 *  of probabilities and a ground truth labeling. Optionally compute the
 *  gradient with respect to the inputs.
 * \param [in] activations pointer to the activations in either CPU or GPU
 *             addressable memory, depending on info. We assume a fixed
 *             memory layout for this 3 dimensional tensor, which has dimension
 *             (t, n, p), where t is the time index, n is the minibatch index,
 *             and p indexes over probabilities of each symbol in the alphabet.
 *             The memory layout is (t, n, p) in C order (slowest to fastest changing
 *             index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
 *             changing index, aka column-major). We also assume strides are equal to
 *             dimensions - there is no padding between dimensions.
 *             More precisely, element (t, n, p), for a problem with mini_batch examples
 *             in the mini batch, and alphabet_size symbols in the alphabet, is located at:
 *             activations[(t * mini_batch + n) * alphabet_size + p]
 * \param [out] gradients if not NULL, then gradients are computed. Should be
 *              allocated in the same memory space as probs and memory
 *              ordering is identical.
 * \param [in] flat_labels Always in CPU memory. A concatenation
 *             of all the labels for the minibatch.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size The number of possible output symbols. There
 *             should be this many probabilities for each time step.
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [out] costs Always in CPU memory. The cost of each example in the
 *              minibatch.
 * \param [in,out] workspace In same memory space as probs. Should be of
 *                 size requested by get_workspace_size.
 * \param [in] options see struct ctcOptions
 *
 * \return Status information
 *
 * */
ctcStatus_t compute_ctc_loss(const float* const activations,
                             float* gradients,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options);

/** For a given set of labels and minibatch size return the required workspace
 *  size. This will need to be allocated in the same memory space as your
 *  probabilities.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
 *             the number of probabilities at each time step
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [in] info see struct ctcOptions
 * \param [out] size_bytes is pointer to a scalar where the memory
 *              requirement in bytes will be placed. This memory should be allocated
 *              at the same place, CPU or GPU, that the probs are in
 *
 * \return Status information
 **/
ctcStatus_t get_workspace_size(const int* const label_lengths,
                               const int* const input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions info,
                               size_t* size_bytes);

#ifdef __cplusplus
}
#endif
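The intended calling sequence is: query get_workspace_size, allocate the workspace in the matching memory space, then call compute_ctc_loss. A minimal CPU-path sketch (my addition; the sizes and label values are hypothetical), following the (t, n, p) layout documented above:

#include <cstring>
#include <cstdlib>
#include <vector>
#include "ctc.h"

int main() {
    const int alphabet_size = 5, minibatch = 1;      // hypothetical sizes
    const int T = 10, L = 3;                         // 10 frames, 3-label transcript
    std::vector<float> activations(T * minibatch * alphabet_size, 0.f);
    std::vector<float> gradients(activations.size(), 0.f);
    std::vector<int> flat_labels = {1, 2, 3};        // all labels, concatenated
    std::vector<int> label_lengths = {L};
    std::vector<int> input_lengths = {T};
    std::vector<float> costs(minibatch);

    ctcOptions options;
    std::memset(&options, 0, sizeof(options));       // forward-compatible zeroing, per the doc comment
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t workspace_bytes = 0;
    if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                           alphabet_size, minibatch, options,
                           &workspace_bytes) != CTC_STATUS_SUCCESS)
        return 1;

    void* workspace = std::malloc(workspace_bytes);  // CPU memory because loc == CTC_CPU
    ctcStatus_t status = compute_ctc_loss(activations.data(), gradients.data(),
                                          flat_labels.data(), label_lengths.data(),
                                          input_lengths.data(), alphabet_size,
                                          minibatch, costs.data(), workspace, options);
    std::free(workspace);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}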
include/detail/cpu_ctc.h (deleted, 100644 → 0)
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template<typename ProbT>
class CpuCTC {
public:
    // Noncopyable
    CpuCTC(int alphabet_size, int minibatch, void* workspace,
           int num_threads, int blank_label) :
        alphabet_size_(alphabet_size),
        minibatch_(minibatch),
        num_threads_(num_threads),
        workspace_(workspace),
        blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
        if (num_threads > 0) {
            omp_set_num_threads(num_threads);
        } else {
            num_threads_ = omp_get_max_threads();
        }
#endif
    };

    CpuCTC(const CpuCTC&) = delete;
    CpuCTC& operator=(const CpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:
    class CpuCTC_metadata {
    private:
        int setup_labels(const int* const labels, int blank_label, int L, int S);

    public:
        CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
                        void* workspace, size_t bytes_used, int blank_label,
                        const int* const labels);

        ProbT* alphas;
        ProbT* betas;
        int* labels_w_blanks;
        int* e_inc;
        int* s_inc;
        ProbT* output;
        int repeats;
    };

    int alphabet_size_; // Number of characters plus blank
    int minibatch_;
    int num_threads_;
    int blank_label_;
    void* workspace_;

    void softmax(const ProbT* const activations, ProbT* probs,
                 const int* const input_lengths);

    std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
                                                 const ProbT* const probs,
                                                 const int* const labels,
                                                 int T, int L, int mb,
                                                 size_t bytes_used);

    ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T,
                         const int* const e_inc,
                         const int* const s_inc,
                         const int* const labels,
                         ProbT* alphas);

    ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                 ProbT log_partition, int repeats,
                                 int S, int T,
                                 const int* const e_inc,
                                 const int* const s_inc,
                                 const int* const labels,
                                 ProbT* alphas,
                                 ProbT* betas,
                                 ProbT* output);
};

template<typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
                                                int alphabet_size,
                                                void* workspace,
                                                size_t bytes_used,
                                                int blank_label,
                                                const int* const labels) {
    alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S * T;
    std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
    betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S;
    std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
    labels_w_blanks = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * alphabet_size;

    repeats = setup_labels(labels, blank_label, L, S);
}

template<typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
                                                 int blank_label, int L, int S) {
    int e_counter = 0;
    int s_counter = 0;

    s_inc[s_counter++] = 1; // get start

    int repeats = 0; // number of repeats

    for (int i = 1; i < L; ++i) {
        if (labels[i - 1] == labels[i]) { // repeat label
            s_inc[s_counter++] = 1;
            s_inc[s_counter++] = 1; // label and blank
            e_inc[e_counter++] = 1;
            e_inc[e_counter++] = 1;
            ++repeats;
        } else { // single label and no repeat
            s_inc[s_counter++] = 2;
            e_inc[e_counter++] = 2;
        }
    }
    e_inc[e_counter++] = 1; // get end

    // // printf("s_counter=%d, e_counter=%d, repeats=%d\n", s_counter, e_counter, repeats);
    // for (int i = 0; i < S; ++i) {
    //     printf("s_inc[%d]=%d, e_inc[%d]=%d\n", i, s_inc[i], i, e_inc[i]);
    // }

    for (int i = 0; i < L; ++i) {
        labels_w_blanks[2 * i] = blank_label;
        labels_w_blanks[2 * i + 1] = labels[i];
    }
    labels_w_blanks[S - 1] = blank_label; // end is blank

    return repeats;
}

template<typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
                            const int* const input_lengths) {
#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) { // iter batch
        for (int c = 0; c < input_lengths[mb]; ++c) { // iter input audio vec
            int col_offset = (mb + minibatch_ * c) * alphabet_size_; // vec index * alphabet_size_

            //// get max_activation
            ProbT max_activation = -std::numeric_limits<ProbT>::infinity(); // start from -infinity
            for (int r = 0; r < alphabet_size_; ++r) // iter alphabet
                max_activation = std::max(max_activation, activations[r + col_offset]);

            //// compute probs between activations and max
            ProbT denom = ProbT(0.);
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
                denom += probs[r + col_offset];
            }

            //// scale probs
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] /= denom;
            }
        }
    }
}

template<typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const probs,
                                    const int* const labels,
                                    int T, int L, int mb, size_t bytes_used) {
    const int S = 2 * L + 1; // Number of labels with blanks

    CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used,
                         blank_label_, labels);

    bool over_threshold = false;

    // check (length of labels + repeats) <= (length of utterance)
    if (L + ctcm.repeats > T) {
        return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
    }

    ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T,
                                     ctcm.e_inc, ctcm.s_inc,
                                     ctcm.labels_w_blanks, ctcm.alphas);

    ProbT llBackward = compute_betas_and_grad(grad, probs, llForward,
                                              ctcm.repeats, S, T,
                                              ctcm.e_inc, ctcm.s_inc,
                                              ctcm.labels_w_blanks,
                                              ctcm.alphas, ctcm.betas, ctcm.output);

    ProbT diff = std::abs(llForward - llBackward);
    if (diff > ctc_helper::threshold) {
        over_threshold = true;
    }

    return std::make_tuple(-llForward, over_threshold);
}

// Computes forward probabilities
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
                                    const int* const e_inc,
                                    const int* const s_inc,
                                    const int* const labels,
                                    ProbT* alphas) {
    int start = (((S / 2) + repeats - T) < 0) ? 0 : 1,
        end = S > 1 ? 2 : 1;

    // get log probs of label
    for (int i = start; i < end; ++i) {
        alphas[i] = std::log(probs[labels[i]]);
    }
    // printf("start=%d, end=%d, t=1~srcLen=%d, repeats=%d\n", start, end, T, repeats);

    for (int t = 1; t < T; ++t) {
        int remain = (S / 2) + repeats - (T - t);
        // printf("t=%d, remain=%d\n", t, remain);
        if (remain >= 0)
            start += s_inc[remain];
        if (t <= (S / 2) + repeats)
            end += e_inc[t - 1];
        int startloop = start;
        int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

        if (start == 0) {
            alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
            // printf("00 alphas[%d]=%f, alphas[%d]=%f\n", t, alphas[idx1], t - 1, alphas[idx2]);
            startloop += 1;
        }
        // printf("start=%d, startloop=%d, end=%d\n", start, startloop, end);

        for (int i = startloop; i < end; ++i) {
            ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2],
                                                           alphas[(i - 1) + idx2]);
            // printf("11 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);

            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) {
                prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
                // printf("22 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
            }

            alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
            // printf("33 alpha[%d,%d]=%f, log(p(%d))=%f, label(%d)=%d\n", t, i, alphas[i + idx1], labels[i], std::log(probs[labels[i] + idx3]), i, labels[i]);
        }
    }
    // printf("final start=%d, end=%d\n", start, end);

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
    }
    // printf("compute alpha cost=%f\n", -loglike);

#ifdef DEBUG_KERNEL
    printf("cpu alphas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", alphas[t + (T - 1) * S]);
    }
    printf("\n");
    printf("alphas loglike=%f\n", loglike);
#endif

    return loglike;
}

// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update product alpha * beta and then
// sum into the gradient associated with each label.
// NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
// Assumed passed in grads are already zeroed!
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                            ProbT log_partition, int repeats,
                                            int S, int T,
                                            const int* const e_inc,
                                            const int* const s_inc,
                                            const int* const labels,
                                            ProbT* alphas,
                                            ProbT* betas,
                                            ProbT* output) {
    int start = S > 1 ? (S - 2) : 0,
        end = (T > (S / 2) + repeats) ? S : S - 1;

    std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

    // set the starting values in the beta column at the very right edge
    for (int i = start; i < end; ++i) {
        betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);

        // compute alpha * beta in log space at this position in (S, T) space
        alphas[i + (T - 1) * S] += betas[i];

        // update the gradient associated with this label
        // essentially performing a reduce-by-key in a sequential manner
        output[labels[i]] =
            ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
    }

    // update the gradient wrt to each unique label
    for (int i = 0; i < alphabet_size_; ++i) {
        int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

        if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
            probs[idx3] == 0.0) {
            grad[idx3] = probs[idx3];
        } else {
            grad[idx3] = probs[idx3] -
                std::exp(output[i] - std::log(probs[idx3]) - log_partition);
        }
    }

    // loop from the second to last column all the way to the left
    for (int t = T - 2; t >= 0; --t) {
        int remain = (S / 2) + repeats - (T - t);
        if (remain >= -1)
            start -= s_inc[remain + 1];
        if (t < (S / 2) + repeats)
            end -= e_inc[t];

        int endloop = end == S ? end - 1 : end;
        int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

        std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

        for (int i = start; i < endloop; ++i) {
            ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != (S - 2) && labels[i] != labels[i + 2]) {
                next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
            }
            betas[i] = next_sum + std::log(probs[labels[i] + idx3]);

            // compute alpha * beta in log space
            alphas[i + idx1] += betas[i];

            // update the gradient associated with this label
            output[labels[i]] =
                ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
        }

        if (end == S) {
            betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
            alphas[(S - 1) + idx1] += betas[(S - 1)];

            output[labels[S - 1]] =
                ctc_helper::log_plus<ProbT>()(alphas[S - 1 + idx1], output[labels[S - 1]]);
        }

        // go over the unique labels and compute the final grad
        // wrt to each one at this time step
        for (int i = 0; i < alphabet_size_; ++i) {
            if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
                probs[idx3] == 0.0) {
                grad[idx3] = probs[idx3];
            } else {
                grad[idx3] = probs[idx3] -
                    std::exp(output[i] - std::log(probs[idx3]) - log_partition);
            }
            ++idx3;
        }
    }

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
    }

#ifdef DEBUG_KERNEL
    printf("cpu betas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", betas[t]);
    }
    printf("\n");
    printf("betas loglike=%f\n", loglike);
#endif

    return loglike;
}

template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                                         ProbT* grads,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    // get max length input audio vector
    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    // memory to use
    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per minibatch memory
    size_t per_minibatch_bytes = 0;

    // get max length input text vector
    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1; // labels with blanks

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_; // vector of alphabet

    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT; // matrix size

    // betas
    per_minibatch_bytes += sizeof(float) * maxS;
    // sequence label size is n, alloc 2n+1, with blanks

    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    // compute softmax probs
    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription

        bool mb_status;

        std::tie(costs[mb], mb_status) =
            cost_and_grad_kernel(grads + mb * alphabet_size_,
                                 probs + mb * alphabet_size_,
                                 flat_labels + std::accumulate(label_lengths,
                                                               label_lengths + mb, 0),
                                 T, L, mb,
                                 bytes_used + mb * per_minibatch_bytes);
    }

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per minibatch memory
    size_t per_minibatch_bytes = 0;

    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_;
    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT;
    // betas
    per_minibatch_bytes += sizeof(float) * maxS;
    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription
        const int S = 2 * L + 1;         // Number of labels with blanks

        CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
                             bytes_used + mb * per_minibatch_bytes,
                             blank_label_,
                             flat_labels + std::accumulate(label_lengths,
                                                           label_lengths + mb, 0));

        if (L + ctcm.repeats > T)
            costs[mb] = ProbT(0);
        else {
            costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
                                        ctcm.repeats, S, T,
                                        ctcm.e_inc, ctcm.s_inc,
                                        ctcm.labels_w_blanks, ctcm.alphas);
        }
    }

    return CTC_STATUS_SUCCESS;
}
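One detail that is easy to misread above: softmax addresses frame c of minibatch entry mb at col_offset = (mb + minibatch_ * c) * alphabet_size_, which is the same address as the documented activations[(t * mini_batch + n) * alphabet_size + p] with t = c, n = mb, p = 0. A tiny standalone sketch (my addition, hypothetical sizes) that prints the resulting element ranges so the loop can be followed on paper:

#include <cstdio>

int main() {
    const int minibatch = 2, alphabet = 3, T = 4; // hypothetical sizes
    // activations[(t * minibatch + n) * alphabet + p], per the ctc.h doc comment
    for (int mb = 0; mb < minibatch; ++mb)
        for (int c = 0; c < T; ++c) {
            int col_offset = (mb + minibatch * c) * alphabet; // as in CpuCTC::softmax
            std::printf("mb=%d t=%d -> elements [%d, %d)\n",
                        mb, c, col_offset, col_offset + alphabet);
        }
    return 0;
}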
include/detail/ctc_helper.h (deleted, 100644 → 0)
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {

static const float threshold = 1e-1;

template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }

inline int div_up(int x, int y) {
    return (x + y - 1) / y;
}

template<typename Arg, typename Res = Arg> struct maximum {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x < y ? y : x;
    }
};

template<typename Arg, typename Res = Arg> struct minimum {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x < y ? x : y;
    }
};

template<typename Arg, typename Res = Arg> struct add {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x + y;
    }
};

template<typename Arg, typename Res = Arg> struct identity {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return Res(x); }
};

template<typename Arg, typename Res = Arg> struct negate {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return Res(-x); }
};

template<typename Arg, typename Res = Arg> struct exponential {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return std::exp(x); }
};

template<typename Arg1, typename Arg2 = Arg1, typename Res = Arg1>
struct log_plus {
    typedef Res result_type;
    HOSTDEVICE
    Res operator()(const Arg1& p1, const Arg2& p2) {
        if (p1 == neg_inf<Arg1>())
            return p2;
        if (p2 == neg_inf<Arg2>())
            return p1;
        Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
        return result;
    }
};

//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
//    HOSTDEVICE
//    Res operator()(const Arg1& p1, const Arg2& p2) {
//        Res p12_max = maximum<Res>()(p1, p2);
//        Res p12_min = minimum<Res>()(p1, p2);
//        Res p12_diff = p12_min - p12_max;
//        Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
//        Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
//                     : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//        return result;
//    }
//};

}
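log_plus is a two-term log-sum-exp: it evaluates log(exp(p1) + exp(p2)) as max(p1, p2) + log1p(exp(-|p1 - p2|)), so nothing large is ever exponentiated. A quick host-side check of that identity (my addition, written against plain doubles rather than the templated functor):

#include <algorithm>
#include <cassert>
#include <cmath>

// Same identity as ctc_helper::log_plus, written out for two doubles.
double log_plus_ref(double p1, double p2) {
    if (std::isinf(p1) && p1 < 0) return p2;
    if (std::isinf(p2) && p2 < 0) return p1;
    return std::log1p(std::exp(-std::fabs(p1 - p2))) + std::max(p1, p2);
}

int main() {
    // log(e^-1000 + e^-1001) stays finite even though both exp() calls would underflow.
    double r = log_plus_ref(-1000.0, -1001.0);
    assert(r > -1000.0 && r < -999.0);
    // Agrees with the naive formula wherever the naive formula is representable.
    assert(std::fabs(log_plus_ref(0.0, 1.0) -
                     std::log(std::exp(0.0) + std::exp(1.0))) < 1e-12);
    return 0;
}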
include/detail/gpu_ctc.h (deleted, 100644 → 0)
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
#include <stdio.h>
const int kCUDABlockNumThreads = 256;

template<typename ProbT>
class GpuCTC {
public:
    GpuCTC(int alphabet_size, int minibatch, void* workspace,
           CUstream stream, int blank_label) :
        out_dim_(alphabet_size), minibatch_(minibatch),
        gpu_workspace_(workspace), stream_(stream),
        blank_label_(blank_label) {};

    // Noncopyable
    GpuCTC(const GpuCTC&) = delete;
    GpuCTC& operator=(const GpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:
    template<int NT, int VT>
    ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
                                          ProbT* grads,
                                          bool compute_alpha,
                                          bool compute_beta);

    ctcStatus_t launch_gpu_kernels(const ProbT* const probs,
                                   ProbT* grads,
                                   size_t config,
                                   bool launch_alpha,
                                   bool launch_beta);

    ctcStatus_t setup_gpu_metadata(const int* const flat_labels,
                                   const int* const label_lengths,
                                   const int* const input_lengths);

    ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths,
                                                  const int* const flat_labels,
                                                  const int* const input_lengths,
                                                  size_t& best_config);

    ctcStatus_t compute_probs(const ProbT* const activations);

    ctcStatus_t compute_cost_and_score(const ProbT* const activations,
                                       ProbT* grads,
                                       ProbT* costs,
                                       const int* const flat_labels,
                                       const int* const label_lengths,
                                       const int* const input_lengths,
                                       bool compute_alpha,
                                       bool compute_betas_and_grad);

    int out_dim_;          // Number of characters plus blank
    int minibatch_;
    int S_;
    int T_;
    int activation_cols_;  // Number of columns in activations
    CUstream stream_;
    int blank_label_;
    void* gpu_workspace_;  // Buffer for all temporary GPU memory
    int* utt_length_;      // T
    int* label_sizes_;     // L
    int* repeats_;         // repeats_
    int* label_offsets_;
    int* labels_without_blanks_;
    int* labels_with_blanks_;
    ProbT* alphas_;
    ProbT* nll_forward_;
    ProbT* nll_backward_;
    ProbT* denoms_;        // Temporary storage for denoms for softmax
    ProbT* probs_;         // Temporary storage for probabilities (softmax output)
};

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
                                              const int* const label_lengths,
                                              const int* const input_lengths) {
    size_t gpu_bytes_used = 0;

    nll_forward_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(ProbT);

    nll_backward_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(ProbT);

    repeats_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    label_offsets_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    // This is the max of all S and T for all valid examples in the minibatch.
    // A valid example is one for which L + repeats <= T
    S_ = 0;
    T_ = 0;

    // This is the max of all timesteps, valid or not. Needed to compute offsets
    int Tmax = 0;

    // This is the max of all labels, valid or not. Needed to compute offsets
    int Lmax = 0;
    int total_label_length = 0;

    constexpr int cpu_buffer_size = 64;
    int repeats[cpu_buffer_size];
    int label_offsets[cpu_buffer_size];

    const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);

    hipError_t cuda_status;

    for (int pass = 0; pass < num_passes; ++pass) {
        const int start_idx = pass * cpu_buffer_size;
        const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);

        for (int j = start_idx; j < end_idx; ++j) {
            const int L = label_lengths[j];
            const int local_T = input_lengths[j];
            const int* label_ptr = &(flat_labels[total_label_length]);

            label_offsets[j % cpu_buffer_size] = total_label_length;
            total_label_length += L;

            int repeat_counter = 0;

            for (int i = 1; i < L; ++i)
                repeat_counter += (label_ptr[i] == label_ptr[i - 1]);

            repeats[j % cpu_buffer_size] = repeat_counter;
            const bool valid_label = ((L + repeat_counter) <= local_T);

            // Only update S and T if label is valid
            S_ = (valid_label) ? std::max(S_, L) : S_;
            T_ = (valid_label) ? std::max(T_, local_T) : T_;

            Tmax = std::max(Tmax, local_T);
            Lmax = std::max(Lmax, L);
        }

        cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats,
                                     (end_idx - start_idx) * sizeof(int),
                                     hipMemcpyHostToDevice, stream_);
        if (cuda_status != hipSuccess)
            return CTC_STATUS_MEMOPS_FAILED;

        cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
                                     (end_idx - start_idx) * sizeof(int),
                                     hipMemcpyHostToDevice, stream_);
        if (cuda_status != hipSuccess)
            return CTC_STATUS_MEMOPS_FAILED;
    }

    S_ = 2 * S_ + 1;

    const int Smax = 2 * Lmax + 1;
    activation_cols_ = minibatch_ * Tmax;

    // Allocate memory for T
    utt_length_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(utt_length_, input_lengths,
                                 minibatch_ * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    label_sizes_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(label_sizes_, label_lengths,
                                 minibatch_ * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    labels_without_blanks_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += Lmax * minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels,
                                 total_label_length * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    labels_with_blanks_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += Smax * minibatch_ * sizeof(int);

    alphas_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);

    denoms_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += activation_cols_ * sizeof(ProbT);

    probs_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
                                                     ProbT* grads,
                                                     bool compute_alpha,
                                                     bool compute_beta) {
    // One thread block per utterance
    const int grid_size = minibatch_;

    // The data is laid out so that the next timestep is minibatch entries away
    const int stride = minibatch_;

    if (compute_alpha) {
        compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (probs, label_sizes_, utt_length_, repeats_,
             labels_without_blanks_, label_offsets_, labels_with_blanks_,
             alphas_, nll_forward_, stride, out_dim_, S_, T_, blank_label_);
        hipStreamSynchronize(stream_);
    }

    if (compute_beta) {
        compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (probs, label_sizes_, utt_length_, repeats_, labels_with_blanks_,
             alphas_, nll_forward_, nll_backward_, grads,
             stride, out_dim_, S_, T_, blank_label_);
        hipStreamSynchronize(stream_);
    }

    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
                                                 const int* const label_lengths,
                                                 const int* const input_lengths,
                                                 size_t& best_config) {
    // Setup the metadata for GPU
    ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    constexpr int num_configs = 12;

    int config_NT[num_configs] = {32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
    int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};

    best_config = 0;
    for (int i = 0; i < num_configs; ++i) {
        if ((config_NT[i] * config_VT[i]) >= S_)
            break;
        else
            best_config++;
    }

    if (best_config >= num_configs)
        return CTC_STATUS_UNKNOWN_ERROR;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const probs,
                                              ProbT* grads,
                                              size_t config,
                                              bool l_a,
                                              bool l_b) {
    switch (config) {
        case 0:  { return launch_alpha_beta_kernels<32,  1>(probs, grads, l_a, l_b); }
        case 1:  { return launch_alpha_beta_kernels<64,  1>(probs, grads, l_a, l_b); }
        case 2:  { return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b); }
        case 3:  { return launch_alpha_beta_kernels<64,  3>(probs, grads, l_a, l_b); }
        case 4:  { return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b); }
        case 5:  { return launch_alpha_beta_kernels<32,  9>(probs, grads, l_a, l_b); }
        case 6:  { return launch_alpha_beta_kernels<64,  6>(probs, grads, l_a, l_b); }
        case 7:  { return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b); }
        case 8:  { return launch_alpha_beta_kernels<64,  9>(probs, grads, l_a, l_b); }
        case 9:  { return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b); }
        case 10: { return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b); }
        case 11: { return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b); }
    }

    return CTC_STATUS_EXECUTION_FAILED;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
    hipError_t cuda_status;
    cuda_status = hipMemcpyAsync(probs_, activations,
                                 activation_cols_ * out_dim_ * sizeof(ProbT),
                                 hipMemcpyDeviceToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;
    cuda_status = hipStreamSynchronize(stream_);

    // Numerically stable SM
    ctcStatus_t ctc_status = reduce_max(probs_, denoms_, out_dim_,
                                        activation_cols_, 1, stream_);
    if (ctc_status != CTC_STATUS_SUCCESS)
        return ctc_status;

    // Kernel launch to subtract maximum
    const int NT = kCUDABlockNumThreads;
    const int VT = 1;
    const int NV = NT * VT;
    const int num_elements = out_dim_ * activation_cols_;
    const int grid_size = ctc_helper::div_up(num_elements, NV);

    prepare_stable_SM_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
        (ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);

    // Reduce along columns to calculate denominator
    ctc_status = reduce_exp(probs_, denoms_, out_dim_,
                            activation_cols_, 1, stream_);
    if (ctc_status != CTC_STATUS_SUCCESS)
        return ctc_status;

    // Kernel launch to calculate probabilities
    compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
        (ctc_helper::exponential<ProbT>(), probs_, denoms_, out_dim_, num_elements);

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
                                                  ProbT* grads,
                                                  ProbT* costs,
                                                  const int* const flat_labels,
                                                  const int* const label_lengths,
                                                  const int* const input_lengths,
                                                  bool compute_alpha,
                                                  bool compute_betas_and_grad) {
    size_t best_config;
    ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
                                                           label_lengths,
                                                           input_lengths,
                                                           best_config);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    status = compute_probs(activations);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    launch_gpu_kernels(probs_, grads, best_config,
                       compute_alpha, compute_betas_and_grad);

    hipError_t cuda_status_mem, cuda_status_sync;
    cuda_status_mem = hipMemcpyAsync(costs, nll_forward_,
                                     sizeof(ProbT) * minibatch_,
                                     hipMemcpyDeviceToHost, stream_);
    cuda_status_sync = hipStreamSynchronize(stream_);
    if (cuda_status_mem != hipSuccess || cuda_status_sync != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                                         ProbT* grads,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    return compute_cost_and_score(activations, grads, costs, flat_labels,
                                  label_lengths, input_lengths, true, true);
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    return compute_cost_and_score(activations, nullptr, costs, flat_labels,
                                  label_lengths, input_lengths, true, false);
}
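Both the CPU and GPU paths carve every temporary buffer out of the single caller-provided workspace with the same bump-pointer idiom: reinterpret_cast of a char* base plus a running byte offset. A sketch of that idiom in isolation (my addition; names are illustrative, and the real code relies on the natural alignment of the carved types):

#include <cstddef>

// Hypothetical illustration of the carve-from-workspace idiom used above.
template <typename T>
T* carve(void* workspace, size_t& bytes_used, size_t count) {
    T* p = reinterpret_cast<T*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += count * sizeof(T);
    return p;
}

void layout_example(void* workspace, int minibatch) {
    size_t used = 0;
    float* nll_forward  = carve<float>(workspace, used, minibatch);
    float* nll_backward = carve<float>(workspace, used, minibatch);
    int*   repeats      = carve<int>(workspace, used, minibatch);
    (void)nll_forward; (void)nll_backward; (void)repeats;
    // 'used' must never exceed the size reported by get_workspace_size().
}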
include/detail/gpu_ctc_kernels.h (deleted, 100644 → 0)
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
#include <stdio.h>
using
namespace
mgpu
;
template
<
int
NT
,
int
VT
,
typename
T
,
typename
KeyT
,
typename
Op
>
struct
CTASegReduce
{
enum
{
NV
=
NT
*
VT
};
union
Storage
{
typename
CTAScan
<
NT
>::
Storage
scanStorage
;
int
indices
[
NV
];
};
//adapted from global kernel KernelReduceByKeyPreprocess
__device__
static
void
preprocessKeys
(
KeyT
*
keys
,
int
count
,
int
*
numUniqueLabels
,
int
seg_start
[
VT
],
int
seg_end
[
VT
],
int
*
scanout
)
{
__shared__
Storage
shared
;
const
int
tid
=
threadIdx
.
x
;
// Compare adjacent keys within each thread and mark discontinuities
int
endFlags
=
0
;
T
key
=
keys
[
VT
*
tid
];
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
int
index
=
VT
*
tid
+
1
+
i
;
T
next
=
keys
[
index
];
if
(
index
==
count
||
(
index
<
count
&&
key
!=
next
))
{
endFlags
|=
1
<<
i
;
}
key
=
next
;
}
__syncthreads
();
//Count the number of encountered end flags
int
scan
=
CTAScan
<
NT
>::
Scan
(
tid
,
popc
(
endFlags
),
shared
.
scanStorage
,
numUniqueLabels
);
__syncthreads
();
//output the unique keys
//use indices as scratch space
int
outputPos
=
scan
;
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
if
((
endFlags
>>
i
)
&
1
)
{
shared
.
indices
[
outputPos
]
=
keys
[
VT
*
tid
+
i
];
scanout
[
outputPos
]
=
VT
*
tid
+
i
;
outputPos
++
;
}
}
__syncthreads
();
// Create start and end
for
(
int
idx
=
tid
,
j
=
0
;
idx
<
(
*
numUniqueLabels
);
idx
+=
blockDim
.
x
,
++
j
)
{
seg_start
[
j
]
=
(
idx
==
0
)
?
0
:
(
scanout
[
idx
-
1
]
+
1
);
seg_end
[
j
]
=
scanout
[
idx
];
}
__syncthreads
();
//copy from the scratch space back into the keys
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
keys
[
i
*
NT
+
tid
]
=
shared
.
indices
[
i
*
NT
+
tid
];
}
__syncthreads
();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, thus a sparse
// set of values are read from the probs matrix since the character set is much smaller
// than the labels. This is much more true for Mandarin than English.
template
<
typename
ProbT
,
int
NT
,
int
VT
>
__global__
void
compute_alpha_kernel
(
const
ProbT
*
probs
,
const
int
*
label_sizes
,
const
int
*
utt_length
,
const
int
*
repeats_in_labels
,
const
int
*
labels_without_blanks
,
const
int
*
label_offsets
,
int
*
labels_with_blanks
,
ProbT
*
alphas
,
ProbT
*
nll_forward
,
int
stride
,
int
out_dim
,
int
S_memoffset
,
int
T_memoffset
,
int
blank_label
)
{
ctc_helper
::
log_plus
<
ProbT
>
log_plus_f
;
const
int
tid
=
threadIdx
.
x
;
const
int
L
=
label_sizes
[
blockIdx
.
x
];
const
int
T
=
utt_length
[
blockIdx
.
x
];
const
int
S
=
2
*
L
+
1
;
const
int
prob_offset
=
out_dim
*
blockIdx
.
x
;
const
int
repeats
=
repeats_in_labels
[
blockIdx
.
x
];
const
int
NV
=
NT
*
VT
;
__shared__
int
label
[
NV
];
if
((
L
+
repeats
)
>
T
)
return
;
// Generate labels with blanks from labels without blanks
{
const
int
label_start_offset
=
label_offsets
[
blockIdx
.
x
];
for
(
int
idx
=
tid
;
idx
<
L
;
idx
+=
blockDim
.
x
)
{
const
int
offset
=
(
blockIdx
.
x
*
S_memoffset
)
+
2
*
idx
;
labels_with_blanks
[
offset
]
=
blank_label
;
labels_with_blanks
[
offset
+
1
]
=
labels_without_blanks
[
label_start_offset
+
idx
];
}
if
(
tid
==
0
)
{
labels_with_blanks
[(
blockIdx
.
x
*
S_memoffset
)
+
2
*
L
]
=
blank_label
;
}
}
__syncthreads
();
const
int
*
labels
=
labels_with_blanks
;
const
int
*
label_global
=
&
labels
[
blockIdx
.
x
*
S_memoffset
];
ProbT
*
alpha
=
&
alphas
[
blockIdx
.
x
*
(
S_memoffset
*
T_memoffset
)];
// Set the first row of alpha neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
for
(
int
idx
=
tid
;
idx
<
min
(
S
,
NV
);
idx
+=
blockDim
.
x
)
{
alpha
[
idx
]
=
ctc_helper
::
neg_inf
<
ProbT
>
();
}
// Load labels into shared memory
#pragma unroll
for
(
int
i
=
tid
;
i
<
S
;
i
+=
NT
)
{
label
[
i
]
=
label_global
[
i
];
}
__syncthreads
();
int
start
=
(
L
+
repeats
<
T
)
?
0
:
1
;
int
end
=
S
>
1
?
2
:
1
;
// Initialize the first row corresponding to t=0;
for
(
int
i
=
tid
;
i
<
(
end
-
start
);
i
+=
blockDim
.
x
)
{
alpha
[
i
+
start
]
=
log
(
probs
[
prob_offset
+
label
[
i
+
start
]]);
//printf("compute_alpha_kernel probs is %f\n", probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel alpha is %f\n", alpha[i + start]);
}
__syncthreads
();
// Fill in the rest of matrix, one row at a time (outer loop).
for
(
int
t
=
1
;
t
<
T
;
++
t
)
{
// Start offsets into the current and previous row
const
int
start_cur_row
=
t
*
S
;
const
int
start_prev_row
=
(
t
-
1
)
*
S
;
// The prob is a 2D column major array, with probabilites for each t strided
// by (out_dim * stride), where stride is the minibatch size, out_dim is alphabet_size
const
int
start_prob_col
=
t
*
(
out_dim
*
stride
);
// This is the first column and in this case there is nothing left of it
if
(
tid
==
0
)
{
if
(
start
==
0
)
{
alpha
[
start_cur_row
]
=
alpha
[
start_prev_row
]
+
log
(
probs
[
prob_offset
+
start_prob_col
+
blank_label
]);
}
else
if
(
start
==
1
)
{
alpha
[
start_cur_row
]
=
alpha
[
start_prev_row
];
}
}
__syncthreads
();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
#pragma unroll
for
(
int
idx
=
(
tid
+
1
);
idx
<
S
;
idx
+=
blockDim
.
x
)
{
ProbT
prev_sum
=
log_plus_f
(
alpha
[
idx
+
start_prev_row
],
alpha
[(
idx
-
1
)
+
start_prev_row
]);
// Skip two if not on blank and not on repeat.
if
((
label
[
idx
]
!=
blank_label
)
&&
(
idx
!=
1
)
&&
(
label
[
idx
]
!=
label
[
idx
-
2
]))
prev_sum
=
log_plus_f
(
prev_sum
,
alpha
[(
idx
-
2
)
+
start_prev_row
]);
alpha
[
idx
+
start_cur_row
]
=
prev_sum
+
log
(
probs
[
prob_offset
+
start_prob_col
+
label
[
idx
]]);
}
__syncthreads
();
}
if
(
tid
==
0
)
{
// Add and return the rightmost two/one element(s) in the last row.
ProbT
loglike
=
ctc_helper
::
neg_inf
<
ProbT
>
();
// This is the total increment for s_inc and e_inc through the loop
const
int
val
=
2
*
(
L
-
1
)
+
1
-
(((
L
+
repeats
)
==
T
)
?
1
:
0
);
start
=
(
val
*
(
L
!=
0
)
+
start
);
end
=
(
val
*
(
L
!=
0
)
+
end
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
loglike
=
log_plus_f
(
loglike
,
alpha
[
i
+
(
T
-
1
)
*
S
]);
}
nll_forward
[
blockIdx
.
x
]
=
-
loglike
;
}
}
// Computes backward probabilities. This also fills in a T * S matrix
//
// See comments above compute_alphas for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel(const ProbT* probs, const int* label_sizes,
                                   const int* utt_length,
                                   const int* repeats_in_labels,
                                   const int* labels_with_blanks,
                                   ProbT* alphas,
                                   const ProbT* nll_forward,
                                   ProbT* nll_backward,
                                   ProbT* grads,
                                   int stride, int out_dim,
                                   int S_memoffset, int T_memoffset,
                                   int blank_label) {

    ctc_helper::log_plus<ProbT> log_plus_f;
    typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;

    const int tid = threadIdx.x;
    const int L = label_sizes[blockIdx.x];
    const int T = utt_length[blockIdx.x];
    const int S = 2 * L + 1;
    const int prob_offset = out_dim * blockIdx.x;
    const int repeats = repeats_in_labels[blockIdx.x];
    const ProbT log_partition = -nll_forward[blockIdx.x];

    const int* labels = labels_with_blanks;
    const int* label_global = &labels[blockIdx.x * S_memoffset];
    ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];

    const int NV = NT * VT;
    union TempStorage {
        ProbT beta[NV];
        int result[NV];
    };

    __shared__ TempStorage temp_buffer;
    __shared__ int label[NV];

    // Temporaries needed for segmented reduce
    // TODO: see if we can combine the shared memory requirements
    __shared__ int keys_shared[NV];
    __shared__ int gather_indices[NV];
    __shared__ ProbT output[NV];

    ProbT beta_val[VT];

    if ((L + repeats) > T)
        return;

    int start = S > 1 ? (S - 2) : 0;
    int end = (L + repeats < T) ? S : S - 1;

    // Set up shared memory buffers
    #pragma unroll
    for (int idx = tid; idx < NV; idx += NT) {
        label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
    }
    __syncthreads();

    int uniquelabels;
    int seg_start[VT];
    int seg_end[VT];

    // Sort labels and record the indices from which to gather
    {
        int key[VT];
        int gather_val[VT];

        #pragma unroll
        for (int i = 0; i < VT; ++i) {
            const int idx = tid * VT + i;
            gather_val[i] = idx;
            key[i] = label[idx];
        }
        __syncthreads();

        CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>
            (key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
        __syncthreads();

        for (int i = 0; i < VT; ++i) {
            const int idx = tid * VT + i;
            gather_indices[idx] = gather_val[i];
        }
        __syncthreads();

        SegReduce::preprocessKeys(keys_shared, S, &uniquelabels,
                                  seg_start, seg_end, temp_buffer.result);
        __syncthreads();
    }

    // TODO: probably not necessary
    __syncthreads();

    // Initialize the beta buffer to -infinity
    #pragma unroll
    for (int idx = tid; idx < NV; idx += NT) {
        temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
    }
    __syncthreads();

    // Initialize the two rightmost values in the last row (assuming L non-zero)
    for (int i = tid; i < (end - start); i += blockDim.x)
        temp_buffer.beta[i + start] =
            log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
    __syncthreads();

    // Load output data in registers through the transpose trick - should really be a function
    #pragma unroll
    for (int idx = tid; idx < S; idx += NT) {
        output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
    }
    __syncthreads();

    // Start at the second to last row and go backward in time
    for (int t = T - 1; t >= 0; --t) {
        // Start offset into the current row
        const int start_cur_row = t * S;
        // Starting offset of the column that we read from the probs array
        const int start_prob_col = t * (out_dim * stride);

        if (t < T - 1) {
            // Filling in one row at a time but going back in time from the last row
            // to the first. As in the forward pass, there is no loop dependence and we
            // apply a variable-length filter with a maximum filter size of 3.
            #pragma unroll
            for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
                ProbT next_sum = log_plus_f(temp_buffer.beta[idx],
                                            temp_buffer.beta[idx + 1]);
                // Skip two if not on blank and not on repeat.
                if ((label[idx] != blank_label) && (idx != (S - 2)) &&
                    (label[idx] != label[idx + 2]))
                    next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);

                beta_val[i] = next_sum +
                    log(probs[prob_offset + start_prob_col + label[idx]]);
            }
            __syncthreads();

            // Initialize the rightmost column since there is nothing to its right,
            // then update the input buffer for the next iteration.
            if ((tid == 0) && (end == S))
                temp_buffer.beta[(S - 1)] = temp_buffer.beta[(S - 1)] +
                    log(probs[prob_offset + start_prob_col + blank_label]);

            #pragma unroll
            for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
                temp_buffer.beta[idx] = beta_val[i];
            }
            __syncthreads();

            // Beta computation done - add to alpha and update the gradient. Reload
            // the result into output for the segmented reduce later on.
            #pragma unroll
            for (int idx = tid; idx < S; idx += NT) {
                output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
            }
            __syncthreads();
        }
        __syncthreads();

        // Compute the segmented reduction of output using the label as key
        {
            // Somewhat faster key-value reduce
            ProbT accum[VT];
            for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
                accum[j] = ctc_helper::neg_inf<ProbT>();
                for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
                    accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
                }
            }
            __syncthreads();

            // Write the accumulated values back into output since it is no longer needed
            for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
                output[idx] = accum[j];
            }
            __syncthreads();

            for (int idx = tid; idx < out_dim; idx += blockDim.x) {
                const int grads_offset = prob_offset + start_prob_col + idx;
                grads[grads_offset] = probs[grads_offset];
            }
            __syncthreads();

            for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
                const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
                ProbT grad = output[idx];
                // Skip degenerate cases (zero probability or -inf accumulator).
                if ((grad != 0.0) && (probs[grads_offset] != 0.0) &&
                    (grad != ctc_helper::neg_inf<ProbT>())) {
                    grads[grads_offset] =
                        probs[grads_offset] -
                        exp(grad - log(probs[grads_offset]) - log_partition);
                }
            }
            __syncthreads();
        }

        // Output the backward log-likelihood
        if ((t == 0) && (tid == 0)) {
            ProbT loglike = ctc_helper::neg_inf<ProbT>();
            const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);

            start = (-val * (L != 0) + start);
            end = (-val * (L != 0) + end);

            // Sum and return the leftmost one/two value(s) in the first row
            for (int i = start; i < end; ++i)
                loglike = log_plus_f(loglike, temp_buffer.beta[i]);

            nll_backward[blockIdx.x] = -loglike;
        }
        // For some reason this synchronization is important
        __syncthreads();
    }
}
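The gradient update above is the standard CTC result: for each symbol k at time t, grad_t(k) = p_t(k) - exp(logsumexp over states s with label k of (alpha_t(s) + beta_t(s)) - log p_t(k) - logZ), where logZ is the forward log-likelihood (log_partition). A hedged NumPy restatement of the per-timestep segmented reduction, assuming alpha and beta are already computed in log space:

import numpy as np

def ctc_grads(log_probs, alpha, beta, labels_with_blanks, log_partition):
    """Gradient of the negative log-likelihood w.r.t. the softmax outputs,
    mirroring the kernel's per-timestep segmented reduce over equal labels."""
    T, A = log_probs.shape
    probs = np.exp(log_probs)
    grads = probs.copy()                      # default: prob minus nothing
    for t in range(T):
        # Accumulate alpha+beta per distinct label (the segmented reduce).
        acc = {}
        for s, k in enumerate(labels_with_blanks):
            acc[k] = np.logaddexp(acc.get(k, -np.inf), alpha[t, s] + beta[t, s])
        for k, v in acc.items():
            grads[t, k] = probs[t, k] - np.exp(v - log_probs[t, k] - log_partition)
    return grads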
template<typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT* probs,
                                     const ProbT* const denom,
                                     int alphabet_size,
                                     int count) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    #pragma unroll
    for (int i = 0; i < VT; i++) {
        if (idx < count) {
            const int column_idx = idx / alphabet_size;
            probs[idx] = f(probs[idx]) / denom[column_idx];
        }
        idx += stride;
    }
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
                                         const ProbT* const col_max,
                                         int alphabet_size,
                                         int count) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    #pragma unroll
    for (int i = 0; i < VT; i++) {
        if (idx < count) {
            const int column_idx = idx / alphabet_size;
            probs[idx] = f(probs[idx] - col_max[column_idx]);
        }
        idx += stride;
    }
}
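Taken together with the reductions in src/reduce.cu below, these two kernels implement a numerically stable column-wise softmax over the activations: reduce_max finds each column's maximum, prepare_stable_SM_kernel shifts by it, reduce_exp produces the denominator, and compute_probs_kernel normalizes. A minimal NumPy sketch of that pipeline; the composition of the ops is my reading of the upstream warp-ctc GPU path (identity for the shift, exponential for the final normalization), not code from this file:

import numpy as np

def stable_softmax_columns(acts):
    # acts: (alphabet_size, n_cols); each column is one (time, batch) slot.
    col_max = acts.max(axis=0)            # reduce_max over each column
    shifted = acts - col_max              # prepare_stable_SM_kernel with identity op
    denom = np.exp(shifted).sum(axis=0)   # reduce_exp
    return np.exp(shifted) / denom        # compute_probs_kernel with exponential op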
include/detail/hostdevice.h deleted 100644 → 0
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
include/detail/reduce.h deleted 100644 → 0
#pragma once

ctcStatus_t reduce_negate(const float* input, float* output,
                          int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream);
pytorch_binding/setup.cfg deleted 100644 → 0
[tool:pytest]
pytorch_binding/setup.py deleted 100644 → 0
import torch
from typing import List, Optional, Union
import glob
import os
import shlex
import subprocess
import sys

from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from setuptools import find_packages, setup
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging  # type: ignore[attr-defined]

SUBPROCESS_DECODE_ARGS = ()  # args for bytes.decode(); empty means default utf-8


def _find_rocm_home() -> Optional[str]:
    rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH')
    if rocm_home is None:
        try:
            pipe_hipcc = subprocess.Popen(
                ["which hipcc | xargs readlink -f"],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            hipcc, _ = pipe_hipcc.communicate()
            rocm_home = os.path.dirname(os.path.dirname(
                hipcc.decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n')))
            if os.path.basename(rocm_home) == 'hip':
                rocm_home = os.path.dirname(rocm_home)
        except Exception:
            rocm_home = '/opt/rocm'
            if not os.path.exists(rocm_home):
                rocm_home = None
    if rocm_home and torch.version.hip is None:
        print(f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'")
    return rocm_home


def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
    if cflags is not None:
        for flag in cflags:
            if 'amdgpu-target' in flag:
                return ['-fno-gpu-rdc']
    archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906')
    flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')]
    flags += ['-fno-gpu-rdc']
    return flags


ROCM_HOME = _find_rocm_home()
IS_HIP_EXTENSION = True if ((ROCM_HOME is not None) and (torch.version.hip is not None)) else False

COMMON_HIP_FLAGS = [
    '-fPIC',
    '-D__HIP_PLATFORM_HCC__=1',
]

COMMON_HIPCC_FLAGS = [
    '-DCUDA_HAS_FP16=1',
    '-D__HIP_NO_HALF_OPERATORS__=1',
    '-D__HIP_NO_HALF_CONVERSIONS__=1',
]


def is_ninja_available():
    try:
        subprocess.check_output('ninja --version'.split())
    except Exception:
        return False
    else:
        return True


def verify_ninja_availability():
    if not is_ninja_available():
        raise RuntimeError("Ninja is required to load C++ extensions")


def _is_cuda_file(path: str) -> bool:
    valid_ext = ['.cu', '.cuh']
    if IS_HIP_EXTENSION:
        valid_ext.append('.hip')
    return os.path.splitext(path)[1] in valid_ext


def _join_rocm_home(*paths) -> str:
    if ROCM_HOME is None:
        raise EnvironmentError('ROCM_HOME environment variable is not set. ')
    return os.path.join(ROCM_HOME, *paths)


def _write_ninja_file(path, cflags, post_cflags, cuda_cflags, cuda_post_cflags,
                      sources, objects, ldflags, library_target, with_cuda) -> None:
    def sanitize_flags(flags):
        if flags is None:
            return []
        else:
            return [flag.strip() for flag in flags]

    cflags = sanitize_flags(cflags)
    post_cflags = sanitize_flags(post_cflags)
    cuda_cflags = sanitize_flags(cuda_cflags)
    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
    ldflags = sanitize_flags(ldflags)

    assert len(sources) == len(objects)
    assert len(sources) > 0

    compiler = os.environ.get('CXX', 'c++')

    config = ['ninja_required_version = 1.3']
    config.append(f'cxx = {compiler}')
    if with_cuda:
        if IS_HIP_EXTENSION:
            nvcc = _join_rocm_home('bin', 'hipcc')
        config.append(f'nvcc = {nvcc}')

    flags = [f'cflags = {" ".join(cflags)}']
    flags.append(f'post_cflags = {" ".join(post_cflags)}')
    if with_cuda:
        flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}')
        flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}')
    flags.append(f'ldflags = {" ".join(ldflags)}')

    sources = [os.path.abspath(file) for file in sources]

    compile_rule = ['rule compile']
    compile_rule.append(' command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags')
    compile_rule.append(' depfile = $out.d')
    compile_rule.append(' deps = gcc')

    if with_cuda:
        cuda_compile_rule = ['rule cuda_compile']
        nvcc_gendeps = ''
        required_cuda_version = packaging.version.parse('10.2')
        has_cuda_version = torch.version.cuda is not None
        if has_cuda_version and packaging.version.parse(torch.version.cuda) >= required_cuda_version:
            cuda_compile_rule.append(' depfile = $out.d')
            cuda_compile_rule.append(' deps = gcc')
        cuda_compile_rule.append(
            f' command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags')

    build = []
    for source_file, object_file in zip(sources, objects):
        is_cuda_source = _is_cuda_file(source_file) and with_cuda
        rule = 'cuda_compile' if is_cuda_source else 'compile'
        source_file = source_file.replace(" ", "$ ")
        object_file = object_file.replace(" ", "$ ")
        build.append(f'build {object_file}: {rule} {source_file}')

    if library_target is not None:
        link_rule = ['rule link']
        link_rule.append(' command = $cxx $in $ldflags -o $out')
        link = [f'build {library_target}: link {" ".join(objects)}']
        default = [f'default {library_target}']
    else:
        link_rule, link, default = [], [], []

    blocks = [config, flags, compile_rule]
    if with_cuda:
        blocks.append(cuda_compile_rule)
    blocks += [link_rule, build, link, default]
    with open(path, 'w') as build_file:
        for block in blocks:
            lines = '\n'.join(block)
            build_file.write(f'{lines}\n\n')


def _get_num_workers(verbose: bool) -> Optional[int]:
    max_jobs = os.environ.get('MAX_JOBS')
    if max_jobs is not None and max_jobs.isdigit():
        if verbose:
            print(f'Using envvar MAX_JOBS ({max_jobs}) as the number of workers...')
        return int(max_jobs)
    if verbose:
        print('Allowing ninja to set a default number of workers... ')
    return None


def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> None:
    command = ['ninja', '-v']
    num_workers = _get_num_workers(verbose)
    if num_workers is not None:
        command.extend(['-j', str(num_workers)])
    env = os.environ.copy()
    try:
        sys.stdout.flush()
        sys.stderr.flush()
        stdout_fileno = 1
        subprocess.run(
            command,
            stdout=stdout_fileno if verbose else subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=build_directory,
            check=True,
            env=env)
    except subprocess.CalledProcessError as e:
        _, error, _ = sys.exc_info()
        message = error_prefix
        if hasattr(error, 'output') and error.output:  # type: ignore[union-attr]
            message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}"  # type: ignore[union-attr]
        raise RuntimeError(message) from e


def _write_ninja_file_and_compile_objects(
        sources: List[str],
        objects,
        cflags,
        post_cflags,
        cuda_cflags,
        cuda_post_cflags,
        build_directory: str,
        verbose: bool,
        with_cuda: Optional[bool]) -> None:
    verify_ninja_availability()
    compiler = os.environ.get('CXX', 'c++')
    if with_cuda is None:
        with_cuda = any(map(_is_cuda_file, sources))
    build_file_path = os.path.join(build_directory, 'build.ninja')
    if verbose:
        print(f'Emitting ninja build file {build_file_path}...')
    _write_ninja_file(
        path=build_file_path,
        cflags=cflags,
        post_cflags=post_cflags,
        cuda_cflags=cuda_cflags,
        cuda_post_cflags=cuda_post_cflags,
        sources=sources,
        objects=objects,
        ldflags=None,
        library_target=None,
        with_cuda=with_cuda)
    if verbose:
        print('Compiling objects...')
    _run_ninja_build(
        build_directory,
        verbose,
        error_prefix='Error compiling objects for extension')


class BuildReleaseExtension(BuildExtension):
    def __init__(self, *args, **kwargs) -> None:
        super(BuildReleaseExtension, self).__init__(*args, **kwargs)

    def build_extensions(self) -> None:
        self._check_abi()

        cuda_ext = False
        extension_iter = iter(self.extensions)
        extension = next(extension_iter, None)
        while not cuda_ext and extension:
            for source in extension.sources:
                _, ext = os.path.splitext(source)
                if ext == '.cu':
                    cuda_ext = True
                    break
            extension = next(extension_iter, None)

        for extension in self.extensions:
            if isinstance(extension.extra_compile_args, dict):
                for ext in ['cxx', 'nvcc']:
                    if ext not in extension.extra_compile_args:
                        extension.extra_compile_args[ext] = []

            self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H')
            for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
                val = getattr(torch._C, f"_PYBIND11_{name}")
                self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"')
            self._define_torch_extension_name(extension)
            self._add_gnu_cpp_abi_flag(extension)

        self.compiler.src_extensions += ['.cu', '.cuh', '.hip']

        def append_std17_if_no_std_present(cflags) -> None:
            cpp_format_prefix = '/{}:' if self.compiler.compiler_type == 'msvc' else '-{}='
            cpp_flag_prefix = cpp_format_prefix.format('std')
            cpp_flag = cpp_flag_prefix + 'c++14'
            if not any(flag.startswith(cpp_flag_prefix) for flag in cflags):
                cflags.append(cpp_flag)

        def convert_to_absolute_paths_inplace(paths):
            if paths is not None:
                for i in range(len(paths)):
                    if not os.path.isabs(paths[i]):
                        paths[i] = os.path.abspath(paths[i])

        def unix_wrap_ninja_compile(sources,
                                    output_dir=None,
                                    macros=None,
                                    include_dirs=None,
                                    debug=0,
                                    extra_preargs=None,
                                    extra_postargs=None,
                                    depends=None):
            output_dir = os.path.abspath(output_dir)
            convert_to_absolute_paths_inplace(self.compiler.include_dirs)

            _, objects, extra_postargs, pp_opts, _ = \
                self.compiler._setup_compile(output_dir, macros,
                                             include_dirs, sources,
                                             depends, extra_postargs)
            common_cflags = self.compiler._get_cc_args(pp_opts, debug, extra_preargs)
            extra_cc_cflags = self.compiler.compiler_so[1:]
            if (debug):
                print("debug mode")
            else:
                extra_cc_cflags.remove('-g')
                extra_cc_cflags.remove('-Wall')
                print("release mode")
            with_cuda = any(map(_is_cuda_file, sources))

            if isinstance(extra_postargs, dict):
                post_cflags = extra_postargs['cxx']
            else:
                post_cflags = list(extra_postargs)
            if IS_HIP_EXTENSION:
                post_cflags = COMMON_HIP_FLAGS + post_cflags
            append_std17_if_no_std_present(post_cflags)

            cuda_post_cflags = None
            cuda_cflags = None
            if with_cuda:
                cuda_cflags = common_cflags
                if isinstance(extra_postargs, dict):
                    cuda_post_cflags = extra_postargs['nvcc']
                else:
                    cuda_post_cflags = list(extra_postargs)
                if IS_HIP_EXTENSION:
                    cuda_post_cflags = cuda_post_cflags + _get_rocm_arch_flags(cuda_post_cflags)
                    cuda_post_cflags = COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags
                append_std17_if_no_std_present(cuda_post_cflags)
                cuda_cflags = [shlex.quote(f) for f in cuda_cflags]
                cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags]

            _write_ninja_file_and_compile_objects(
                sources=sources,
                objects=objects,
                cflags=[shlex.quote(f) for f in extra_cc_cflags + common_cflags],
                post_cflags=[shlex.quote(f) for f in post_cflags],
                cuda_cflags=cuda_cflags,
                cuda_post_cflags=cuda_post_cflags,
                build_directory=output_dir,
                verbose=True,
                with_cuda=with_cuda)

            return objects

        self.compiler.compile = unix_wrap_ninja_compile
        build_ext.build_extensions(self)


def get_version():
    return "0.1"


def get_extensions():
    extensions = []
    include_dirs = []
    define_macros = []
    extra_compile_args = {'cxx': ['-O3'], 'nvcc': []}

    args = []
    args += ['-DWARPCTC_ENABLE_GPU']
    args += ['-DCTC_DISABLE_OMP']
    # args += ['-DDEBUG_KERNEL']
    args += ['-Wno-deprecated']
    extra_compile_args['cxx'] += args
    extra_compile_args['nvcc'] += args

    op_files = glob.glob('./src/*.cu') + glob.glob('./src/*.cpp') + \
        ['../src/reduce.cu', '../src/ctc_entrypoint.cu']
    print('op_files = ', op_files)
    extension = CUDAExtension
    include_dirs.append(os.path.realpath('../include/'))
    include_dirs.append('/opt/dtk/rocrand/include/')
    include_dirs.append('/opt/dtk/hiprand/include/')
    print('include_dirs = ', include_dirs)

    ext_ops = extension(
        name="_warp_ctc",
        sources=op_files,
        include_dirs=include_dirs,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args)
    extensions.append(ext_ops)
    return extensions


def main():
    setup(
        name='warpctc_pytorch',
        version=get_version(),
        description='Torch fuseop Computer Vision Foundation',
        keywords='computer vision',
        packages=find_packages(),
        include_package_data=False,
        package_data={'warpctc_pytorch': ["src/*.cuh", "src/*.cu", "src/*.hip", "src/*.cpp"]},
        ext_modules=get_extensions(),
        cmdclass={'build_ext': BuildReleaseExtension},
        zip_safe=False)


if __name__ == "__main__":
    main()
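Because BuildReleaseExtension reads its configuration from environment variables, a DCU/ROCm build can be driven with a small wrapper. A sketch (the architecture string, job count, and ROCM_HOME value are examples only, not values mandated by this repository):

# build.py - hypothetical driver around the setup.py above
import os
import subprocess
import sys

os.environ.setdefault('PYTORCH_ROCM_ARCH', 'gfx906')  # read by _get_rocm_arch_flags
os.environ.setdefault('MAX_JOBS', '8')                # read by _get_num_workers
os.environ.setdefault('ROCM_HOME', '/opt/rocm')       # short-circuits _find_rocm_home

subprocess.run([sys.executable, 'setup.py', 'install'], check=True)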
pytorch_binding/src/binding.cu deleted 100644 → 0
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>

#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include <THC/THCGeneral.h>
extern THCState* state;
#endif

#include "ctc.h"

int cpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label)
{
    float* probs_ptr = (float*)probs.data_ptr();
    float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
    int* sizes_ptr = (int*)sizes.data_ptr();
    int* labels_ptr = (int*)labels.data_ptr();
    int* label_sizes_ptr = (int*)label_sizes.data_ptr();
    float* costs_ptr = (float*)costs.data_ptr();

    const int probs_size = probs.size(2);

    ctcOptions options;
    memset(&options, 0, sizeof(options));
    options.loc = CTC_CPU;
    options.num_threads = 0; // will use default number of threads
    options.blank_label = blank_label;

#if defined(CTC_DISABLE_OMP) || defined(APPLE)
    // have to use at least one
    options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif

    size_t cpu_size_bytes;
    get_workspace_size(label_sizes_ptr, sizes_ptr,
                       probs_size, minibatch_size,
                       options, &cpu_size_bytes);

    float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];

    compute_ctc_loss(probs_ptr, grads_ptr,
                     labels_ptr, label_sizes_ptr,
                     sizes_ptr, probs_size,
                     minibatch_size, costs_ptr,
                     cpu_workspace, options);

    delete[] cpu_workspace;
    return 1;
}

#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label)
{
    float* probs_ptr = (float*)probs.data_ptr();
    float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
    int* sizes_ptr = (int*)sizes.data_ptr();
    int* labels_ptr = (int*)labels.data_ptr();
    int* label_sizes_ptr = (int*)label_sizes.data_ptr();
    float* costs_ptr = (float*)costs.data_ptr();

    const int probs_size = probs.size(2);

    ctcOptions options;
    memset(&options, 0, sizeof(options));
    options.loc = CTC_GPU;
    options.blank_label = blank_label;
    options.stream = at::cuda::getCurrentCUDAStream();

    size_t gpu_size_bytes;
    get_workspace_size(label_sizes_ptr, sizes_ptr,
                       probs_size, minibatch_size,
                       options, &gpu_size_bytes);

    void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);

    compute_ctc_loss(probs_ptr, grads_ptr,
                     labels_ptr, label_sizes_ptr,
                     sizes_ptr, probs_size,
                     minibatch_size, costs_ptr,
                     gpu_workspace, options);

    THCudaFree(state, (void *) gpu_workspace);
    return 1;
}
#endif

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
    m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
pytorch_binding/src/binding.hip deleted 100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/hip/HIPContext.h"
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "ATen/hip/HIPEvent.h"
#include <THH/THHGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
pytorch_binding/src/cpu_binding.h deleted 100644 → 0
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label);
pytorch_binding/src/gpu_binding.h deleted 100644 → 0
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label);
pytorch_binding/tests/test_gpu.py deleted 100644 → 0
import torch
import warpctc_pytorch as warp_ctc


def test_empty_label(test_cpu=True, test_gpu=True):
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    grads = torch.zeros(probs.size())
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs.size(1)

    if test_cpu:
        costs = torch.zeros(minibatch_size)
        warp_ctc.cpu_ctc(probs, grads, labels, label_sizes,
                         sizes, minibatch_size, costs, 0)
        print('CPU_cost: %f' % costs.sum())
        print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))

    if test_gpu:
        probs = probs.clone().cuda()
        grads = torch.zeros(probs.size()).cuda()
        costs = torch.zeros(minibatch_size)
        warp_ctc.gpu_ctc(probs, grads, labels, label_sizes,
                         sizes, minibatch_size, costs, 0)
        print('GPU_cost: %f' % costs.sum())
        print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
        print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_empty_label(test_cpu=True, test_gpu=False)
    test_empty_label(test_cpu=False, test_gpu=True)
    # HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
pytorch_binding/tests/test_gpu_speed.py deleted 100644 → 0
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time


def test_compare_cpu(repeat_num=20):
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs.size(1)
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs.size())

    time_st = time.perf_counter()
    # 1. Run the old CPU version
    for i in range(repeat_num):
        probs_old = probs.clone()
        costs_old = costs.clone()
        grads_old = grads.clone()
        warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes,
                         sizes, minibatch_size, costs_old, 0)
        if i == 0:
            print('CPU_costs_old: %f' % costs_old.sum())
            print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(
                probs_old, grads_old, costs_old))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc old version using time: ', time_used)

    time_st = time.perf_counter()
    # 2. Run the new CPU version
    for i in range(repeat_num):
        probs_new = probs.clone()
        costs_new = costs.clone()
        grads_new = grads.clone()
        warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                             sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('CPU_costs_new: %f' % costs_new.sum())
            print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc new version using time: ', time_used)


def test_compare_gpu():
    probs0 = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous().cuda()
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs0.size(1)

    # 1. Run the new version (CPU entry point)
    probs_new = probs0.clone().cuda()
    costs_new = torch.zeros(minibatch_size)
    grads_new = torch.zeros(probs0.size())
    warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
    print('CPU_costs_new: %f' % costs_new.sum())
    print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
        probs_new, grads_new, costs_new))

    # 2. Run the old version (CPU entry point)
    probs = probs0.clone().cuda()
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs0.size())
    warp_ctc.cpu_ctc(probs0, grads, labels, label_sizes,
                     sizes, minibatch_size, costs, 0)
    print('CPU_cost: %f' % costs.sum())
    print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    test_compare_cpu()
    test_compare_gpu()
pytorch_binding/warpctc_pytorch/__init__.py deleted 100644 → 0
import torch
import warpctc_pytorch as warp_ctc
from torch.autograd import Function
from torch.nn import Module
from _warp_ctc import *  # noqa


def _assert_no_grad(tensor):
    assert not tensor.requires_grad, \
        "gradients only computed for acts - please " \
        "mark other tensors as not requiring gradients"


class _CTC(Function):
    @staticmethod
    def forward(ctx, acts, labels, act_lens, label_lens,
                size_average=False, length_average=False, blank=0):
        is_cuda = True if acts.is_cuda else False
        # print('_CTC is_cuda', is_cuda)
        acts = acts.contiguous()
        loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc
        grads = torch.zeros(acts.size()).type_as(acts)
        minibatch_size = acts.size(1)
        costs = torch.zeros(minibatch_size).cpu()
        loss_func(acts, grads, labels, label_lens,
                  act_lens, minibatch_size, costs, blank)

        costs = torch.FloatTensor([costs.sum()])

        if length_average:
            # Compute the avg. log-probability per batch sample and frame.
            total_length = torch.sum(act_lens).item()
            grads = grads / total_length
            costs = costs / total_length
        elif size_average:
            # Compute the avg. log-probability per batch sample.
            grads = grads / minibatch_size
            costs = costs / minibatch_size

        ctx.grads = grads
        return costs

    @staticmethod
    def backward(ctx, grad_output):
        _grad_output = grad_output.to(ctx.grads.device)
        return ctx.grads.mul_(_grad_output), None, None, None, None, None, None


class CTCLoss(Module):
    """
    Parameters:
        size_average (bool): normalize the loss by the batch size
            (default: `False`)
        length_average (bool): normalize the loss by the total number of frames
            in the batch. If `True`, supersedes `size_average`
            (default: `False`)
    """
    def __init__(self, blank=0, size_average=False, length_average=False):
        super(CTCLoss, self).__init__()
        self.ctc = _CTC.apply
        self.blank = blank
        self.size_average = size_average
        self.length_average = length_average

    def forward(self, acts, labels, act_lens, label_lens):
        """
        acts: Tensor of (seqLength x batch x outputDim) containing output from network
        labels: 1-dimensional Tensor containing all the targets of the batch in one sequence
        act_lens: Tensor of size (batch) containing the size of each output sequence from the network
        label_lens: Tensor of (batch) containing the label length of each example
        """
        # labels must be 1-dimensional
        if len(labels.size()) != 1:
            print('error!! len(labels.size()) must be 1, got {}'.format(len(labels.size())))
            raise ValueError
        _assert_no_grad(labels)
        _assert_no_grad(act_lens)
        _assert_no_grad(label_lens)
        return self.ctc(acts, labels, act_lens, label_lens,
                        self.size_average, self.length_average, self.blank)
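For reference, a minimal usage sketch of CTCLoss as defined above; the shapes follow the forward() docstring and the numbers are arbitrary examples (warp-ctc expects raw, unsoftmaxed activations):

import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss(blank=0)
T, N, C = 50, 4, 20                               # seq length, batch, alphabet (incl. blank)
acts = torch.randn(T, N, C, requires_grad=True)   # raw network outputs
labels = torch.IntTensor([1, 2, 2, 3, 1, 5, 4])   # all targets concatenated, 1-D
label_lens = torch.IntTensor([2, 2, 1, 2])        # per-sample target lengths (sum to 7)
act_lens = torch.IntTensor([T, T, T, T])          # per-sample input lengths

loss = ctc_loss(acts, labels, act_lens, label_lens)
loss.backward()                                   # gradients flow through _CTC.backward
print(loss.item())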
src/ctc_entrypoint.cu deleted 100644 → 0
#include <cstddef>
#include <iostream>
#include <algorithm>

#include "ctc.h"
#include "detail/cpu_ctc.h"
#ifdef __HIPCC__
#include "detail/gpu_ctc.h"
#endif

extern "C" {

int get_warpctc_version() {
    return 13;
}

const char* ctcGetStatusString(ctcStatus_t status) {
    switch (status) {
    case CTC_STATUS_SUCCESS:
        return "no error";
    case CTC_STATUS_MEMOPS_FAILED:
        return "cuda memcpy or memset failed";
    case CTC_STATUS_INVALID_VALUE:
        return "invalid value";
    case CTC_STATUS_EXECUTION_FAILED:
        return "execution failed";
    case CTC_STATUS_UNKNOWN_ERROR:
    default:
        return "unknown error";
    }
}

ctcStatus_t compute_ctc_loss(const float* const activations,
                             float* gradients,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options) {
    if (activations == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr ||
        costs == nullptr ||
        workspace == nullptr ||
        alphabet_size <= 0 ||
        minibatch <= 0)
        return CTC_STATUS_INVALID_VALUE;

    if (options.loc == CTC_CPU) {
        CpuCTC<float> ctc(alphabet_size, minibatch, workspace,
                          options.num_threads, options.blank_label);

        if (gradients != NULL)
            return ctc.cost_and_grad(activations, gradients, costs,
                                     flat_labels, label_lengths,
                                     input_lengths);
        else
            return ctc.score_forward(activations, costs, flat_labels,
                                     label_lengths, input_lengths);
    } else if (options.loc == CTC_GPU) {
#ifdef __HIPCC__
        GpuCTC<float> ctc(alphabet_size, minibatch, workspace,
                          options.stream, options.blank_label);

        if (gradients != NULL)
            return ctc.cost_and_grad(activations, gradients, costs,
                                     flat_labels, label_lengths,
                                     input_lengths);
        else
            return ctc.score_forward(activations, costs, flat_labels,
                                     label_lengths, input_lengths);
#else
        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
        return CTC_STATUS_EXECUTION_FAILED;
#endif
    } else {
        return CTC_STATUS_INVALID_VALUE;
    }
}

ctcStatus_t get_workspace_size(const int* const label_lengths,
                               const int* const input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions options,
                               size_t* size_bytes) {
    if (label_lengths == nullptr ||
        input_lengths == nullptr ||
        size_bytes == nullptr ||
        alphabet_size <= 0 ||
        minibatch <= 0)
        return CTC_STATUS_INVALID_VALUE;

    // This is the max of all S and T for all examples in the minibatch.
    int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
    int maxT = *std::max_element(input_lengths, input_lengths + minibatch);

    const int S = 2 * maxL + 1;

    *size_bytes = 0;

    if (options.loc == CTC_GPU) {
        // GPU storage
        // nll_forward, nll_backward
        *size_bytes += 2 * sizeof(float) * minibatch;
        // repeats
        *size_bytes += sizeof(int) * minibatch;
        // label offsets
        *size_bytes += sizeof(int) * minibatch;
        // utt_length
        *size_bytes += sizeof(int) * minibatch;
        // label lengths
        *size_bytes += sizeof(int) * minibatch;
        // labels without blanks - overallocate for now
        *size_bytes += sizeof(int) * maxL * minibatch;
        // labels with blanks
        *size_bytes += sizeof(int) * S * minibatch;
        // alphas
        *size_bytes += sizeof(float) * S * maxT * minibatch;
        // denoms
        *size_bytes += sizeof(float) * maxT * minibatch;
        // probs (since we will pass in activations)
        *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
    } else {
        // The CPU path could eventually replace minibatch with the max number
        // of concurrent threads if memory is really tight.

        // per-minibatch memory
        size_t per_minibatch_bytes = 0;
        // output
        per_minibatch_bytes += sizeof(float) * alphabet_size;
        // alphas
        per_minibatch_bytes += sizeof(float) * S * maxT;
        // betas
        per_minibatch_bytes += sizeof(float) * S;
        // labels w/blanks, e_inc, s_inc
        per_minibatch_bytes += 3 * sizeof(int) * S;

        *size_bytes = per_minibatch_bytes * minibatch;
        // probs
        *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
    }

    return CTC_STATUS_SUCCESS;
}

}
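The GPU branch of get_workspace_size is a straight sum of buffer sizes; the following Python restatement mirrors that accounting (a sketch assuming 4-byte float and int, matching the code above):

def gpu_workspace_bytes(label_lengths, input_lengths, alphabet_size):
    minibatch = len(label_lengths)
    maxL, maxT = max(label_lengths), max(input_lengths)
    S = 2 * maxL + 1
    f = i = 4  # sizeof(float), sizeof(int)
    return (2 * f * minibatch            # nll_forward, nll_backward
            + 4 * i * minibatch          # repeats, label offsets, utt_length, label lengths
            + i * maxL * minibatch       # labels without blanks
            + i * S * minibatch          # labels with blanks
            + f * S * maxT * minibatch   # alphas
            + f * maxT * minibatch       # denoms
            + f * alphabet_size * maxT * minibatch)  # probs

# e.g. gpu_workspace_bytes([2, 0], [2, 2], 5) for the tiny test case used earlier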
src/reduce.cu deleted 100644 → 0
// Includes, system
#include <stdio.h>
#include <stdlib.h>

// Includes, cuda
#include <cuda_runtime.h>
//#include <cublas_v2.h>
#include <cuda_runtime_api.h>

// Includes, cuda helper functions
// #include <helper_cuda.h>

// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"

const int warp_size = 64;
const int kCUDABlockNumThreads = 256;

template<int NT, typename T, typename Rop>
struct CTAReduce;

template<int NT, typename T, typename Rop>
struct CTAReduce {
    enum { Size = NT, Capacity = NT };
    struct Storage { T shared[Capacity]; };

    __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
        T* s = storage.shared;
        s[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
#pragma unroll
        for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
            if (tid + offset < count && tid < offset) {
                x = g(x, s[offset + tid]);
                s[tid] = x;
            }
            __syncthreads();
        }

        // Finish the reduction within a single warp using shuffles.
        T shuff;
        for (int offset = warp_size / 2; offset > 0; offset /= 2) {
            // shuff = __shfl_down(0xFFFFFFF, x, offset);
            shuff = __shfl_down(x, offset);
            if (tid + offset < count && tid < offset) {
                x = g(x, shuff);
            }
        }
        return x;
    }
};

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {
    typedef CTAReduce<NT, T, Rop> R;
    __shared__ typename R::Storage storage;

    int tid = threadIdx.x;
    int idx = tid;
    int col = blockIdx.x;
    T curr;

    // Each block works on a column
    if (idx < num_rows) {
        curr = f(input[idx + col * num_rows]);
    }
    // __syncthreads();
    idx += NT;

    while (idx < num_rows) {
        curr = g(curr, f(input[idx + col * num_rows]));
        idx += NT;
    }

    // Sum thread-totals over the CTA.
    curr = R::reduce(tid, curr, storage, num_rows, g);

    // Store the result
    if (tid == 0) {
        output[col] = curr;
    }
}

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {
    __shared__ T s[NT];

    int warps_per_block = NT / warp_size;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = threadIdx.y;
    T curr;

    if (row < num_rows && col < num_cols) {
        curr = f(input[row + col * num_rows]);
        col += blockDim.y;
        while (col < num_cols) {
            curr = g(curr, f(input[row + col * num_rows]));
            col += blockDim.y;
        }
    }
    s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
    __syncthreads();

    // Reduce
    if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
        for (int i = 1; i < warps_per_block && i < num_cols; ++i)
            curr = g(curr, s[i + threadIdx.x * warps_per_block]);
        output[row] = curr;
    }
}

struct ReduceHelper {
    template<typename T, typename Iof, typename Rof>
    static void impl(Iof f, Rof g, const T* input, T* output,
                     int num_rows, int num_cols, bool axis, CUstream stream) {
        int grid_size;

        if (axis) {
            grid_size = num_cols;
            reduce_rows<kCUDABlockNumThreads><<<grid_size, kCUDABlockNumThreads, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        } else {
            dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
            grid_size = (num_cols + warp_size - 1) / warp_size;
            reduce_cols<kCUDABlockNumThreads><<<grid_size, tpb, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        }
    }
};

template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output,
                   int rows, int cols, bool axis, CUstream stream) {
    ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
    hipStreamSynchronize(stream);
    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float* input, float* output,
                          int rows, int cols, bool axis, CUstream stream) {
    return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_exp(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream) {
    return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_max(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream) {
    auto ctc_status = reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(),
                             input, output, rows, cols, axis, stream);
    return ctc_status;
}