Commit ffdb193b authored by lishen

warpctc for dcu

parent 99e2985d
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. For a non-power-of-2 X the result is rounded up when roundUp is true, and down otherwise.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
} // namespace mgpu
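// Illustrative sketch (added, not part of the original moderngpu header): how a few of
// the compile-time helpers above evaluate, assuming they are used through the mgpu
// namespace. These would hold as static_asserts in C++11:
//   static_assert(mgpu::sIsPow2<8>::value == 1, "8 is a power of 2");
//   static_assert(mgpu::sDivUp<10, 3>::value == 4, "ceil(10 / 3) == 4");
//   static_assert(mgpu::sLogPow2<8>::value == 3, "log2(8) == 3");
//   static_assert(mgpu::sLogPow2<9, true>::value == 4, "rounds up for non-powers of 2");
//   static_assert(mgpu::sRoundUp<10, 4>::value == 12, "round 10 up to a multiple of 4");
//   static_assert(mgpu::sNumFactorsOf2<12>::value == 2, "12 = 2 * 2 * 3");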
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward-declare the stream type to avoid pulling in the runtime headers. In this
// DCU/HIP build, CUstream aliases a HIP stream rather than the CUDA type below.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
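/* Illustrative usage (added sketch, not part of the original header): initialize the
 * options struct as recommended above, then set only the fields you need.
 *
 *   // C++
 *   ctcOptions options{};
 *   options.loc = CTC_CPU;
 *   options.num_threads = 4;   // 0 lets the CPU path use omp_get_max_threads()
 *   options.blank_label = 0;
 *
 *   // C
 *   struct ctcOptions opts;
 *   memset(&opts, 0, sizeof(struct ctcOptions));
 *   opts.loc = CTC_GPU;
 *   opts.stream = stream;      // a caller-created HIP stream
 */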
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
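/* Illustrative call sequence (added sketch): pair get_workspace_size with
 * compute_ctc_loss. The buffers acts, grads, labels, label_lens, input_lens and
 * costs are hypothetical caller-owned arrays laid out as described above.
 *
 *   ctcOptions options{};
 *   options.loc = CTC_CPU;
 *   options.blank_label = 0;
 *   size_t workspace_bytes = 0;
 *   if (get_workspace_size(label_lens, input_lens, alphabet_size, minibatch,
 *                          options, &workspace_bytes) != CTC_STATUS_SUCCESS)
 *       return;
 *   void* workspace = malloc(workspace_bytes);   // for CTC_GPU use hipMalloc instead
 *   ctcStatus_t status = compute_ctc_loss(acts, grads, labels, label_lens, input_lens,
 *                                         alphabet_size, minibatch, costs, workspace,
 *                                         options);
 *   printf("ctc status: %s\n", ctcGetStatusString(status));
 *   free(workspace);
 */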
#ifdef __cplusplus
}
#endif
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template <typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size, int minibatch, void* workspace, int num_threads, int blank_label)
: alphabet_size_(alphabet_size), minibatch_(minibatch), num_threads_(num_threads), workspace_(workspace), blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(
const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size, void* workspace, size_t bytes_used, int blank_label, const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations, ProbT* probs, const int* const input_lengths);
std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad, const ProbT* const probs, const int* const labels, int T, int L, int mb, size_t bytes_used);
ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T, const int* const e_inc, const int* const s_inc, const int* const labels, ProbT* alphas);
ProbT compute_betas_and_grad(
ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template <typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(
int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template <typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels, int blank_label, int L, int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1; // get start
int repeats = 0; // number of repeated labels
for (int i = 1; i < L; ++i) {
if (labels[i - 1] == labels[i]) { // repeat label
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1; // label and blank
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
} else {
s_inc[s_counter++] = 2; // single label and no repeat
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1; // get end
// // printf("s_counter=%d, e_counter=%d, repeats=%d\n", s_counter, e_counter, repeats);
// for (int i = 0; i < S; ++i) {
// printf("s_inc[%d]=%d, e_inc[%d]=%d\n", i, s_inc[i], i, e_inc[i]);
// }
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label; // end is blank
return repeats;
}
template <typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs, const int* const input_lengths) {
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) { // iter batch
for (int c = 0; c < input_lengths[mb]; ++c) { // iter input audio vec
int col_offset = (mb + minibatch_ * c) * alphabet_size_; // vec index * alphabet_size_
//// get max_activation
ProbT max_activation = -std::numeric_limits<ProbT>::infinity(); // running max, start at -inf
for (int r = 0; r < alphabet_size_; ++r) // iter alphabet
max_activation = std::max(max_activation, activations[r + col_offset]);
//// compute probs between activations and max
ProbT denom = ProbT(0.);
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
//// scale probs
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
}
}
}
}
template <typename ProbT>
std::tuple<ProbT, bool> CpuCTC<
ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const probs, const int* const labels, int T, int L, int mb, size_t bytes_used) {
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);
bool over_threshold = false;
// check (length of labels + repeats) <= (length of utterance)
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas);
ProbT llBackward =
compute_betas_and_grad(grad, probs, llForward, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas, ctcm.betas, ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
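// Sketch of the recurrence implemented below (added comment), all in log space with
// log_plus acting as a stable log-sum-exp:
//   alpha_t(s) = log_plus(alpha_{t-1}(s), alpha_{t-1}(s-1) [, alpha_{t-1}(s-2)])
//                + log p_t(label[s])
// where the alpha_{t-1}(s-2) term is included only when label[s] is neither the
// blank nor a repeat of label[s-2].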
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(
const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
// get log probs of label
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
// printf("start=%d, end=%d, t=1~srcLen=%d, repeats=%d\n", start, end, T, repeats);
for (int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
// printf("t=%d, remain=%d\n", t, remain);
if (remain >= 0)
start += s_inc[remain];
if (t <= (S / 2) + repeats)
end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
// printf("00 alphas[%d]=%f, alphas[%d]=%f\n", t, alphas[idx1], t - 1, alphas[idx2]);
startloop += 1;
}
// printf("start=%d, startloop=%d, end=%d\n", start, startloop, end);
for (int i = startloop; i < end; ++i) {
// printf("alphas[(t - 1=%d, u=%d)]=%f\n", t - 1, i, alphas[i + idx2]);
// printf("alphas[(t - 1=%d, u-1=%d)]=%f\n", t - 1, i - 1, alphas[(i - 1) + idx2]);
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i - 1) + idx2]);
// printf("11 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) {
prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
// printf("22 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
}
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
// printf("33 alpha[%d,%d]=%f, log(p(%d))=%f, label(%d)=%d\n", t, i, alphas[i + idx1], labels[i], std::log(probs[labels[i] + idx3]), i, labels[i]);
}
// printf("\n");
}
// printf("final start=%d, end=%d\n", start, end);
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
// printf("compute alpha cost=%f\n", -loglike);
#ifdef DEBUG_KERNEL
printf("cpu alphas:\n");
printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
for (int t = start; t < end; ++t) {
printf("%.5f ", alphas[t + (T - 1) * S]);
}
printf("\n");
printf("alphas loglike=%f\n", loglike);
#endif
return loglike;
}
// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update product alpha * beta and then
// sum into the gradient associated with each label.
// NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
// Assumed passed in grads are already zeroed!
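// Sketch of the quantities computed below (added comment), all in log space. Note
// that, as stored here, both alpha_t(s) and beta_t(s) already include the emission
// term log p_t(label[s]), which is why log p_t(k) is subtracted again in the gradient:
//   beta_t(s) = log_plus(beta_{t+1}(s), beta_{t+1}(s+1) [, beta_{t+1}(s+2)])
//               + log p_t(label[s])
//   grad_t(k) = p_t(k) - exp( logsum_{s : label[s] = k} (alpha_t(s) + beta_t(s))
//                             - log p_t(k) - log_partition )
// where the beta_{t+1}(s+2) term is skipped on blanks and repeats, mirroring
// compute_alphas.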
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(
ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
// set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
// compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
// update the gradient associated with this label
// essentially performing a reduce-by-key in a sequential manner
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
}
// update the gradient wrt to each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() || probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
}
// loop from the second to last column all the way to the left
for (int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= -1)
start -= s_inc[remain + 1];
if (t < (S / 2) + repeats)
end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for (int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S - 2) && labels[i] != labels[i + 2]) {
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
// compute alpha * beta in log space
alphas[i + idx1] += betas[i];
// update the gradient associated with this label
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S - 1) + idx1] += betas[(S - 1)];
output[labels[S - 1]] = ctc_helper::log_plus<ProbT>()(alphas[S - 1 + idx1], output[labels[S - 1]]);
}
// go over the unique labels and compute the final grad
// wrt to each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() || probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
#ifdef DEBUG_KERNEL
printf("cpu betas:\n");
printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
for (int t = start; t < end; ++t) {
printf("%.5f ", betas[t]);
}
printf("\n");
printf("betas loglike=%f\n", loglike);
#endif
return loglike;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr || flat_labels == nullptr || label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
// get max length input audio vector
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
// memory to use
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
// get max length input text vector
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1; // labels with blanks
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_; // vector of alphabet
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT; // matrix size
// betas
per_minibatch_bytes += sizeof(float) * maxS; // a label sequence of length n expands to 2n+1 entries once blanks are inserted
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
// compute softmax probs
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) = cost_and_grad_kernel(
grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T,
L,
mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(
const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr || label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(
L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used + mb * per_minibatch_bytes,
blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template<typename Arg, typename Res = Arg>
struct maximum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? y : x;
}
};
template<typename Arg, typename Res = Arg>
struct minimum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? x : y;
}
};
template<typename Arg, typename Res = Arg>
struct add {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x + y;
}
};
template<typename Arg, typename Res = Arg>
struct identity {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(x);
}
};
template<typename Arg, typename Res = Arg>
struct negate {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(-x);
}
};
template<typename Arg, typename Res = Arg>
struct exponential {
HOSTDEVICE Res operator()(const Arg &x) const { return std::exp(x); }
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1 &p1, const Arg2 &p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
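// Added note: log_plus is a numerically stable log-sum-exp, i.e.
// log_plus(log(a), log(b)) == log(a + b). For example, with a = b = 0.5,
//   ctc_helper::log_plus<float>()(std::log(0.5f), std::log(0.5f))
// evaluates to log1p(exp(0)) + log(0.5) = log(2) + log(0.5) = 0.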
//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
// HOSTDEVICE
// Res operator()(const Arg1& p1, const Arg2& p2) {
// Res p12_max = maximum<Res>()(p1, p2);
// Res p12_min = minimum<Res>()(p1, p2);
// Res p12_diff = p12_min-p12_max;
// Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
// Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
// : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//
// return result;
// }
//};
}
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
#include <stdio.h>
const int kCUDABlockNumThreads = 256;
template<typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void *workspace,
CUstream stream,
int blank_label) :
out_dim_(alphabet_size), minibatch_(minibatch),
gpu_workspace_(workspace), stream_(stream),
blank_label_(blank_label) {};
// Noncopyable
GpuCTC(const GpuCTC &) = delete;
GpuCTC &operator=(const GpuCTC &) = delete;
ctcStatus_t
cost_and_grad(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
ctcStatus_t
score_forward(const ProbT *const activations,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
private:
template<int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT *const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t
launch_gpu_kernels(const ProbT *const probs,
ProbT *grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t
setup_gpu_metadata(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
ctcStatus_t
create_metadata_and_choose_config(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
size_t &best_config);
ctcStatus_t
compute_probs(const ProbT *const activations);
ctcStatus_t
compute_cost_and_score(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void *gpu_workspace_; // Buffer for all temporary GPU memory
int *utt_length_; // T
int *label_sizes_; // L
int *repeats_; // repeats_
int *label_offsets_;
int *labels_without_blanks_;
int *labels_with_blanks_;
ProbT *alphas_;
ProbT *nll_forward_;
ProbT *nll_backward_;
ProbT *denoms_; // Temporary storage for denoms for softmax
ProbT *probs_; // Temporary storage for probabilities (softmax output)
};
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
size_t gpu_bytes_used = 0;
nll_forward_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
hipError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i - 1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT *const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries away
const int stride = minibatch_;
if (compute_alpha) {
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_,
repeats_, labels_without_blanks_, label_offsets_,
labels_with_blanks_, alphas_, nll_forward_,
stride, out_dim_, S_, T_, blank_label_);
hipStreamSynchronize(stream_);
}
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_, repeats_,
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
grads, stride, out_dim_, S_, T_, blank_label_);
hipStreamSynchronize(stream_);
}
hipError_t err = hipGetLastError();
if (err != hipSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
size_t &best_config) {
// Setup the metadata for GPU
ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS)
return status;
constexpr int num_configs = 12;
int config_NT[num_configs] =
{32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] =
{1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i] * config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs)
return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT *const probs,
ProbT *grads,
size_t config,
bool l_a,
bool l_b) {
switch (config) {
case 0: {
return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);
}
case 1: {
return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);
}
case 2: {
return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);
}
case 3: {
return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);
}
case 4: {
return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);
}
case 5: {
return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);
}
case 6: {
return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);
}
case 7: {
return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);
}
case 8: {
return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);
}
case 9: {
return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);
}
case 10: {
return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);
}
case 11: {
return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);
}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_probs(const ProbT *const activations) {
hipError_t cuda_status;
cuda_status = hipMemcpyAsync(probs_, activations,
activation_cols_ * out_dim_ * sizeof(ProbT),
hipMemcpyDeviceToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = hipStreamSynchronize(stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable SM
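// The steps below compute, for each activation column c:
//   probs[r, c] = exp(acts[r, c] - max_i acts[i, c]) / sum_i exp(acts[i, c] - max_i acts[i, c])
// reduce_max writes the per-column maxima into denoms_, prepare_stable_SM_kernel
// subtracts them from probs_, reduce_exp overwrites denoms_ with the per-column sums
// of exp(shifted values), and compute_probs_kernel exponentiates and divides through.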
ctcStatus_t ctc_status = reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to subtract maximum
const int NT = kCUDABlockNumThreads;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status = reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::exponential<ProbT>(), probs_, denoms_, out_dim_, num_elements);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
label_lengths,
input_lengths,
best_config);
if (status != CTC_STATUS_SUCCESS)
return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS)
return status;
launch_gpu_kernels(probs_, grads, best_config,
compute_alpha, compute_betas_and_grad);
hipError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = hipMemcpyAsync(costs, nll_forward_,
sizeof(ProbT) * minibatch_,
hipMemcpyDeviceToHost, stream_);
cuda_status_sync = hipStreamSynchronize(stream_);
if (cuda_status_mem != hipSuccess || cuda_status_sync != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, grads, costs, flat_labels,
label_lengths, input_lengths, true, true);
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT *const activations,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, nullptr, costs, flat_labels,
label_lengths, input_lengths, true, false);
}
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
#include <stdio.h>
using namespace mgpu;
template<int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum {
NV = NT * VT
};
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
//adapted from global kernel KernelReduceByKeyPreprocess
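// Added note: preprocessKeys expects the keys to be sorted; it writes the number of
// distinct keys to *numUniqueLabels, fills seg_start/seg_end with the [start, end]
// index range of each run of equal keys (scanout holds the last index of each run),
// and compacts the unique keys back into the keys array.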
__device__ static void preprocessKeys(KeyT *keys, int count,
int *numUniqueLabels, int seg_start[VT],
int seg_end[VT], int *scanout) {
__shared__
Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if (index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
//Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
//output the unique keys
//use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ((endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels); idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
//copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, so only a sparse
// set of values is read from the probs matrix, since the label sequence is much smaller
// than the character set. This is even more pronounced for Mandarin than for English.
template<typename ProbT, int NT, int VT>
__global__
void compute_alpha_kernel(const ProbT *probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_without_blanks, const int *label_offsets,
int *labels_with_blanks, ProbT *alphas,
ProbT *nll_forward, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T)
return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset + 1] = labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
// Set the first row of alpha neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
// Initialize the first row corresponding to t=0;
for (int i = tid; i < (end - start); i += blockDim.x) {
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel probs is %f\n", probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel alpha is %f\n", alpha[i + start]);
}
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for (int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
// The prob is a 2D column major array, with probabilites for each t strided
// by (out_dim * stride), where stride is the minibatch size, out_dim is alphabet_size
const int start_prob_col = t * (out_dim * stride);
// This is the first column and in this case there is nothing left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] = alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
} else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
#pragma unroll
for (int idx = (tid + 1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], alpha[(idx - 1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != 1) && (label[idx] != label[idx - 2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
// Add and return the rightmost two/one element(s) in the last row.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
// This is the total increment for s_inc and e_inc through the loop
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L != 0) + start);
end = (val * (L != 0) + end);
for (int i = start; i < end; ++i) {
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
}
nll_forward[blockIdx.x] = -loglike;
}
}
// Computes backward probabilities. This also fills in a T * S matrix
//
// See comments above compute_alphas for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel(const ProbT *probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_with_blanks, ProbT *alphas,
const ProbT *nll_forward, ProbT *nll_backward,
ProbT *grads, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__
TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__
ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T)
return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S - 1;
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
// Sort labels and record indices from which to gather from
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int> >
(key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(keys_shared, S, &uniquelabels, seg_start, seg_end,
temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for (int i = tid; i < (end - start); i += blockDim.x)
temp_buffer.beta[i + start] =
log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
// Load output data in registers through the transpose trick - should really be a function
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
// Start at the second to last row and backward in time
for (int t = T - 1; t >= 0; --t) {
// Start offsets into the current and next row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T - 1) {
// Filling up one row at a time but going back in time from the last row
// to the first. As in the forward pass, there is no loop dependence and we
// do a variable length filter of maximum filter size of 3
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
ProbT next_sum = log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != (S - 2)) && (label[idx] != label[idx + 2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);
beta_val[i] = next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
// Initialize values for the rightmost column since there is nothing to the right
// Update input buffer for next iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S - 1)] = temp_buffer.beta[(S - 1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
// Beta Computation done - add to alpha and update the gradient. Reload
// the gradient back for segmented reduce later on
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
// Write accumulated value into output since that is not used
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
if ((grad == 0.0) || (probs[grads_offset] == 0.0) ||
(grad == ctc_helper::neg_inf<ProbT>())) {
} else {
grads[grads_offset] =
probs[grads_offset] - exp(grad - log(probs[grads_offset]) - log_partition);
}
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
// For some reason this is important
__syncthreads();
}
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT *probs,
const ProbT *const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT *probs,
const ProbT *const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
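// How these two kernels fit together (a sketch of the intended math, not the exact
// driver code, which lives in the GpuCTC class): for each time-step column c of the
// activations x,
//
//   prepare_stable_SM_kernel:  y[r, c]     = f(x[r, c] - col_max[c])
//   compute_probs_kernel:      probs[r, c] = f(y[r, c]) / denom[c]
//
// With col_max obtained from a column-wise max reduction and denom a column-wise sum
// of exponentials, the two passes give a numerically stable softmax
//   probs[r, c] = exp(x[r, c] - max_r x[r, c]) / sum_r exp(x[r, c] - max_r x[r, c]).
// Suitable functors (ctc_helper::exponential / ctc_helper::identity) are defined in
// detail/ctc_helper.h. A hypothetical launch (grid/block sizes are illustrative only):
//   compute_probs_kernel<float, 1><<<(count + 127) / 128, 128, 0, stream>>>(
//       ctc_helper::identity<float>(), probs, denoms, alphabet_size, count);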
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
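// Log-space/linear reductions over the (rows x cols) activation matrix, used to build
// the softmax maxima and denominators. Following the ReduceHelper implementation in
// reduce.cu: axis == true reduces down the rows of each column (one output per column),
// axis == false reduces across the columns of each row (one output per row).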
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
import torch
from typing import List, Optional, Union
import glob
import os
import shlex
import subprocess
import sys
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from setuptools import find_packages, setup
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging # type: ignore[attr-defined]
def _find_rocm_home() -> Optional[str]:
rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH')
if rocm_home is None:
try:
pipe_hipcc = subprocess.Popen(
["which hipcc | xargs readlink -f"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
hipcc, _ = pipe_hipcc.communicate()
rocm_home = os.path.dirname(os.path.dirname(hipcc.decode().rstrip('\r\n')))
if os.path.basename(rocm_home) == 'hip':
rocm_home = os.path.dirname(rocm_home)
except Exception:
rocm_home = '/opt/rocm'
if not os.path.exists(rocm_home):
rocm_home = None
if rocm_home and torch.version.hip is None:
print(f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'")
return rocm_home
def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
if cflags is not None:
for flag in cflags:
if 'amdgpu-target' in flag:
return ['-fno-gpu-rdc']
archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906')
flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')]
flags += ['-fno-gpu-rdc']
return flags
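# Minimal usage sketch (the architecture value is an assumption for gfx906-class DCU hardware):
#   PYTORCH_ROCM_ARCH="gfx906" python3 setup.py install
#   -> _get_rocm_arch_flags() returns ['--amdgpu-target=gfx906', '-fno-gpu-rdc']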
ROCM_HOME = _find_rocm_home()
IS_HIP_EXTENSION = True if ((ROCM_HOME is not None) and (torch.version.hip is not None)) else False
COMMON_HIP_FLAGS = [
'-fPIC',
'-D__HIP_PLATFORM_HCC__=1',
]
COMMON_HIPCC_FLAGS = [
'-DCUDA_HAS_FP16=1',
'-D__HIP_NO_HALF_OPERATORS__=1',
'-D__HIP_NO_HALF_CONVERSIONS__=1',
]
def is_ninja_available():
try:
subprocess.check_output('ninja --version'.split())
except Exception:
return False
else:
return True
def verify_ninja_availability():
if not is_ninja_available():
raise RuntimeError("Ninja is required to load C++ extensions")
def _is_cuda_file(path: str) -> bool:
valid_ext = ['.cu', '.cuh']
if IS_HIP_EXTENSION:
valid_ext.append('.hip')
return os.path.splitext(path)[1] in valid_ext
def _join_rocm_home(*paths) -> str:
if ROCM_HOME is None:
raise EnvironmentError('ROCM_HOME environment variable is not set. ')
return os.path.join(ROCM_HOME, *paths)
def _write_ninja_file(path, cflags, post_cflags, cuda_cflags, cuda_post_cflags, sources,
objects, ldflags, library_target, with_cuda) -> None:
def sanitize_flags(flags):
if flags is None:
return []
else:
return [flag.strip() for flag in flags]
cflags = sanitize_flags(cflags)
post_cflags = sanitize_flags(post_cflags)
cuda_cflags = sanitize_flags(cuda_cflags)
cuda_post_cflags = sanitize_flags(cuda_post_cflags)
ldflags = sanitize_flags(ldflags)
assert len(sources) == len(objects)
assert len(sources) > 0
compiler = os.environ.get('CXX', 'c++')
config = ['ninja_required_version = 1.3']
config.append(f'cxx = {compiler}')
if with_cuda:
if IS_HIP_EXTENSION:
nvcc = _join_rocm_home('bin', 'hipcc')
config.append(f'nvcc = {nvcc}')
flags = [f'cflags = {" ".join(cflags)}']
flags.append(f'post_cflags = {" ".join(post_cflags)}')
if with_cuda:
flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}')
flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}')
flags.append(f'ldflags = {" ".join(ldflags)}')
sources = [os.path.abspath(file) for file in sources]
compile_rule = ['rule compile']
compile_rule.append(' command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags')
compile_rule.append(' depfile = $out.d')
compile_rule.append(' deps = gcc')
if with_cuda:
cuda_compile_rule = ['rule cuda_compile']
nvcc_gendeps = ''
required_cuda_version = packaging.version.parse('10.2')
has_cuda_version = torch.version.cuda is not None
if has_cuda_version and packaging.version.parse(torch.version.cuda) >= required_cuda_version:
cuda_compile_rule.append(' depfile = $out.d')
cuda_compile_rule.append(' deps = gcc')
cuda_compile_rule.append(
f' command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags')
build = []
for source_file, object_file in zip(sources, objects):
is_cuda_source = _is_cuda_file(source_file) and with_cuda
rule = 'cuda_compile' if is_cuda_source else 'compile'
source_file = source_file.replace(" ", "$ ")
object_file = object_file.replace(" ", "$ ")
build.append(f'build {object_file}: {rule} {source_file}')
if library_target is not None:
link_rule = ['rule link']
link_rule.append(' command = $cxx $in $ldflags -o $out')
link = [f'build {library_target}: link {" ".join(objects)}']
default = [f'default {library_target}']
else:
link_rule, link, default = [], [], []
blocks = [config, flags, compile_rule]
if with_cuda:
blocks.append(cuda_compile_rule)
blocks += [link_rule, build, link, default]
with open(path, 'w') as build_file:
for block in blocks:
lines = '\n'.join(block)
build_file.write(f'{lines}\n\n')
def _get_num_workers(verbose: bool) -> Optional[int]:
max_jobs = os.environ.get('MAX_JOBS')
if max_jobs is not None and max_jobs.isdigit():
if verbose:
print(f'Using envvar MAX_JOBS ({max_jobs}) as the number of workers...')
return int(max_jobs)
if verbose:
print('Allowing ninja to set a default number of workers... ')
return None
def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> None:
command = ['ninja', '-v']
num_workers = _get_num_workers(verbose)
if num_workers is not None:
command.extend(['-j', str(num_workers)])
env = os.environ.copy()
try:
sys.stdout.flush()
sys.stderr.flush()
stdout_fileno = 1
subprocess.run(command, stdout=stdout_fileno if verbose else subprocess.PIPE, stderr=subprocess.STDOUT,
cwd=build_directory, check=True, env=env)
except subprocess.CalledProcessError as e:
_, error, _ = sys.exc_info()
message = error_prefix
if hasattr(error, 'output') and error.output: # type: ignore[union-attr]
message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}" # type: ignore[union-attr]
raise RuntimeError(message) from e
def _write_ninja_file_and_compile_objects(sources: List[str], objects, cflags, post_cflags, cuda_cflags,
cuda_post_cflags, build_directory: str, verbose: bool,
with_cuda: Optional[bool]) -> None:
verify_ninja_availability()
compiler = os.environ.get('CXX', 'c++')
if with_cuda is None:
with_cuda = any(map(_is_cuda_file, sources))
build_file_path = os.path.join(build_directory, 'build.ninja')
if verbose:
print(f'Emitting ninja build file {build_file_path}...')
_write_ninja_file(path=build_file_path, cflags=cflags, post_cflags=post_cflags, cuda_cflags=cuda_cflags,
cuda_post_cflags=cuda_post_cflags, sources=sources, objects=objects, ldflags=None,
library_target=None, with_cuda=with_cuda)
if verbose:
print('Compiling objects...')
_run_ninja_build(
build_directory,
verbose,
error_prefix='Error compiling objects for extension')
class BuildReleaseExtension(BuildExtension):
def __init__(self, *args, **kwargs) -> None:
super(BuildReleaseExtension, self).__init__(*args, **kwargs)
def build_extensions(self) -> None:
self._check_abi()
cuda_ext = False
extension_iter = iter(self.extensions)
extension = next(extension_iter, None)
while not cuda_ext and extension:
for source in extension.sources:
_, ext = os.path.splitext(source)
if ext == '.cu':
cuda_ext = True
break
extension = next(extension_iter, None)
for extension in self.extensions:
if isinstance(extension.extra_compile_args, dict):
for ext in ['cxx', 'nvcc']:
if ext not in extension.extra_compile_args:
extension.extra_compile_args[ext] = []
self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H')
for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
val = getattr(torch._C, f"_PYBIND11_{name}")
self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"')
self._define_torch_extension_name(extension)
self._add_gnu_cpp_abi_flag(extension)
self.compiler.src_extensions += ['.cu', '.cuh', '.hip']
def append_std14_if_no_std_present(cflags) -> None:
cpp_format_prefix = '/{}:' if self.compiler.compiler_type == 'msvc' else '-{}='
cpp_flag_prefix = cpp_format_prefix.format('std')
cpp_flag = cpp_flag_prefix + 'c++14'
if not any(flag.startswith(cpp_flag_prefix) for flag in cflags):
cflags.append(cpp_flag)
def convert_to_absolute_paths_inplace(paths):
if paths is not None:
for i in range(len(paths)):
if not os.path.isabs(paths[i]):
paths[i] = os.path.abspath(paths[i])
def unix_wrap_ninja_compile(sources, output_dir=None, macros=None, include_dirs=None, debug=0,
extra_preargs=None, extra_postargs=None, depends=None):
output_dir = os.path.abspath(output_dir)
convert_to_absolute_paths_inplace(self.compiler.include_dirs)
_, objects, extra_postargs, pp_opts, _ = \
self.compiler._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
common_cflags = self.compiler._get_cc_args(pp_opts, debug, extra_preargs)
extra_cc_cflags = self.compiler.compiler_so[1:]
if (debug):
print("debug mode")
else:
# Strip debug-only flags if present; list.remove() would raise if a flag were missing.
extra_cc_cflags = [f for f in extra_cc_cflags if f not in ('-g', '-Wall')]
print("release mode")
with_cuda = any(map(_is_cuda_file, sources))
if isinstance(extra_postargs, dict):
post_cflags = extra_postargs['cxx']
else:
post_cflags = list(extra_postargs)
if IS_HIP_EXTENSION:
post_cflags = COMMON_HIP_FLAGS + post_cflags
append_std14_if_no_std_present(post_cflags)
cuda_post_cflags = None
cuda_cflags = None
if with_cuda:
cuda_cflags = common_cflags
if isinstance(extra_postargs, dict):
cuda_post_cflags = extra_postargs['nvcc']
else:
cuda_post_cflags = list(extra_postargs)
if IS_HIP_EXTENSION:
cuda_post_cflags = cuda_post_cflags + _get_rocm_arch_flags(cuda_post_cflags)
cuda_post_cflags = COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags
append_std14_if_no_std_present(cuda_post_cflags)
cuda_cflags = [shlex.quote(f) for f in cuda_cflags]
cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags]
_write_ninja_file_and_compile_objects(sources=sources, objects=objects,
cflags=[shlex.quote(f) for f in extra_cc_cflags + common_cflags],
post_cflags=[shlex.quote(f) for f in post_cflags],
cuda_cflags=cuda_cflags,
cuda_post_cflags=cuda_post_cflags, build_directory=output_dir,
verbose=True, with_cuda=with_cuda)
return objects
self.compiler.compile = unix_wrap_ninja_compile
build_ext.build_extensions(self)
def get_version():
return "0.1"
def get_extensions():
extensions = []
include_dirs = []
define_macros = []
extra_compile_args = {'cxx': ['-O3'], 'nvcc': []}
args = []
args += ['-DWARPCTC_ENABLE_GPU']
args += ['-DCTC_DISABLE_OMP']
# args += ['-DDEBUG_KERNEL']
args += ['-Wno-deprecated']
extra_compile_args['cxx'] += args
extra_compile_args['nvcc'] += args
op_files = glob.glob('./src/*.cu') + glob.glob('./src/*.cpp') + ['../src/reduce.cu', '../src/ctc_entrypoint.cu']
print('op_files = ', op_files)
extension = CUDAExtension
include_dirs.append(os.path.realpath('../include/'))
include_dirs.append('/opt/dtk/rocrand/include/')
include_dirs.append('/opt/dtk/hiprand/include/')
print('include_dirs = ', include_dirs)
ext_ops = extension(
name="_warp_ctc",
sources=op_files,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args)
extensions.append(ext_ops)
return extensions
def main():
setup(
name='warpctc_pytorch',
version=get_version(),
description='PyTorch bindings for warp-ctc (CTC loss) on ROCm/DCU',
keywords='ctc loss',
packages=find_packages(),
include_package_data=False,
package_data={
'warpctc_pytorch': [
"src/*.cuh",
"src/*.cu",
"src/*.hip",
"src/*.cpp"
]
},
ext_modules=get_extensions(),
cmdclass={
'build_ext': BuildReleaseExtension
},
zip_safe=False
)
if __name__ == "__main__":
main()
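# Typical build/install invocation (a hedged example; the exact DCU toolchain environment may differ):
#   ROCM_HOME=/opt/dtk MAX_JOBS=8 python3 setup.py install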
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include <THC/THCGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::cuda::getCurrentCUDAStream();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
// !!! This is a file automatically generated by hipify!!!
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/hip/HIPContext.h"
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "ATen/hip/HIPEvent.h"
#include <THH/THHGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
import torch
import warpctc_pytorch as warp_ctc
def test_empty_label(test_cpu=True, test_gpu=True):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous()
grads = torch.zeros(probs.size())
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs.size(1)
if test_cpu:
costs = torch.zeros(minibatch_size)
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if test_gpu:
probs = probs.clone().cuda()
grads = torch.zeros(probs.size()).cuda()
costs = torch.zeros(minibatch_size)
warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('GPU_cost: %f' % costs.sum())
print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if __name__ == '__main__':
print('torch.cuda.is_available() ', torch.cuda.is_available())
# test_empty_label(test_cpu=True, test_gpu=False)
test_empty_label(test_cpu=False, test_gpu=True)
# HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time
def test_compare_cpu(repeat_num=20):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs.size(1)
costs = torch.zeros(minibatch_size)
grads = torch.zeros(probs.size())
time_st = time.perf_counter()
# 1. Run the old CPU version
for i in range(repeat_num):
probs_old = probs.clone()
costs_old = costs.clone()
grads_old = grads.clone()
warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes, sizes, minibatch_size, costs_old, 0)
if i == 0:
print('CPU_costs_old: %f' % costs_old.sum())
print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(probs_old, grads_old, costs_old))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc old version using time: ', time_used)
time_st = time.perf_counter()
# 2. Run the new CPU version
for i in range(repeat_num):
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc new version using time: ', time_used)
def test_compare_gpu():
probs0 = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous().cuda()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs0.size(1)
# 1. Run the new CPU version
probs_new = probs0.clone().cuda()
costs_new = torch.zeros(minibatch_size)
grads_new = torch.zeros(probs0.size())
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
# 2. Run the old CPU version
probs = probs0.clone().cuda()
costs = torch.zeros(minibatch_size)
grads = torch.zeros(probs0.size())
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if __name__ == '__main__':
print('torch.cuda.is_available() ', torch.cuda.is_available())
test_compare_cpu()
test_compare_gpu()
import torch
import warpctc_pytorch as warp_ctc
from torch.autograd import Function
from torch.nn import Module
from _warp_ctc import * # noqa
def _assert_no_grad(tensor):
assert not tensor.requires_grad, \
"gradients only computed for acts - please " \
"mark other tensors as not requiring gradients"
class _CTC(Function):
@staticmethod
def forward(ctx, acts, labels, act_lens, label_lens, size_average=False,
length_average=False, blank=0):
is_cuda = True if acts.is_cuda else False
# print('_CTC is_cuda', is_cuda)
acts = acts.contiguous()
loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc
grads = torch.zeros(acts.size()).type_as(acts)
minibatch_size = acts.size(1)
costs = torch.zeros(minibatch_size).cpu()
loss_func(acts,
grads,
labels,
label_lens,
act_lens,
minibatch_size,
costs,
blank)
costs = torch.FloatTensor([costs.sum()])
if length_average:
# Compute the avg. log-probability per batch sample and frame.
total_length = torch.sum(act_lens).item()
grads = grads / total_length
costs = costs / total_length
elif size_average:
# Compute the avg. log-probability per batch sample.
grads = grads / minibatch_size
costs = costs / minibatch_size
ctx.grads = grads
return costs
@staticmethod
def backward(ctx, grad_output):
_grad_output = grad_output.to(ctx.grads.device)
return ctx.grads.mul_(_grad_output), None, None, None, None, None, None
class CTCLoss(Module):
"""
Parameters:
size_average (bool): normalize the loss by the batch size
(default: `False`)
length_average (bool): normalize the loss by the total number of frames
in the batch. If `True`, supersedes `size_average`
(default: `False`)
"""
def __init__(self, blank=0, size_average=False, length_average=False):
super(CTCLoss, self).__init__()
self.ctc = _CTC.apply
self.blank = blank
self.size_average = size_average
self.length_average = length_average
def forward(self, acts, labels, act_lens, label_lens):
"""
acts: Tensor of (seqLength x batch x outputDim) containing output from network
labels: 1 dimensional Tensor containing all the targets of the batch in one sequence
act_lens: Tensor of size (batch) containing size of each output sequence from the network
label_lens: Tensor of (batch) containing label length of each example
"""
# labels must be 1 dimensional
if len(labels.size()) != 1:
print('error!! len(labels.size()) must be 1, get {}'.format(len(labels.size())))
raise ValueError
_assert_no_grad(labels)
_assert_no_grad(act_lens)
_assert_no_grad(label_lens)
return self.ctc(acts, labels, act_lens, label_lens, self.size_average,
self.length_average, self.blank)
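# A minimal usage sketch (illustrative values; acts are raw network activations of
# shape seqLength x batch x outputDim, as described in forward() above):
#
#   ctc_loss = CTCLoss(blank=0)
#   acts = torch.randn(15, 2, 5).requires_grad_()   # 15 frames, batch of 2, 5 symbols
#   labels = torch.IntTensor([1, 2, 2, 3])          # all targets concatenated, 1-D
#   label_lens = torch.IntTensor([2, 2])            # targets per utterance
#   act_lens = torch.IntTensor([15, 15])            # frames per utterance
#   cost = ctc_loss(acts, labels, act_lens, label_lens)
#   cost.backward()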
#include <cstddef>
#include <iostream>
#include <algorithm>
#include "ctc.h"
#include "detail/cpu_ctc.h"
#ifdef __HIPCC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() {
return 13;
}
const char *ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float *const activations,
float *gradients,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options) {
if (activations == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr ||
costs == nullptr ||
workspace == nullptr ||
alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations, gradients,
costs,
flat_labels, label_lengths,
input_lengths);
else
return ctc.score_forward(activations,
costs, flat_labels,
label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __HIPCC__
GpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations, gradients, costs,
flat_labels, label_lengths,
input_lengths);
else
return ctc.score_forward(activations, costs, flat_labels,
label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int *const label_lengths,
const int *const input_lengths,
int alphabet_size, int minibatch,
ctcOptions options,
size_t *size_bytes) {
if (label_lengths == nullptr ||
input_lengths == nullptr ||
size_bytes == nullptr ||
alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
// This is the max of all S and T for all examples in the minibatch.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
//nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
//repeats
*size_bytes += sizeof(int) * minibatch;
//label offsets
*size_bytes += sizeof(int) * minibatch;
//utt_length
*size_bytes += sizeof(int) * minibatch;
//label lengths
*size_bytes += sizeof(int) * minibatch;
//labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
//labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
//alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
//denoms
*size_bytes += sizeof(float) * maxT * minibatch;
//probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
//cpu can eventually replace all minibatch with
//max number of concurrent threads if memory is
//really tight
//per minibatch memory
size_t per_minibatch_bytes = 0;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size;
//alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
//betas
per_minibatch_bytes += sizeof(float) * S;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
//probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
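// Typical call sequence for this C API (mirrors the cpu_ctc binding above; the plain
// malloc shown here is illustrative, a GPU allocation is needed for CTC_GPU):
//
//   size_t bytes;
//   get_workspace_size(label_lengths, input_lengths, alphabet_size, minibatch,
//                      options, &bytes);
//   void* workspace = malloc(bytes);
//   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
//                    input_lengths, alphabet_size, minibatch, costs,
//                    workspace, options);
//   free(workspace);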
// Includes, system
#include <stdio.h>
#include <stdlib.h>
// Includes, cuda
#include <cuda_runtime.h>
//#include<cublas_v2.h>
#include <cuda_runtime_api.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"
const int warp_size = 64;
const int kCUDABlockNumThreads = 256;
template<int NT, typename T, typename Rop>
struct CTAReduce;
template<int NT, typename T, typename Rop>
struct CTAReduce {
enum {
Size = NT, Capacity = NT
};
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage &storage, int count, Rop g) {
T *s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
// shuff = __shfl_down(0xFFFFFFF, x, offset);
shuff = __shfl_down(x, offset);
if (tid + offset < count && tid < offset) {
x = g(x, shuff);
}
}
return x;
}
};
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) {
curr = f(input[idx + col * num_rows]);
}
// __syncthreads();
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) {
output[col] = curr;
}
}
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template<typename T, typename Iof, typename Rof>
static void impl(Iof f, Rof g, const T *input, T *output, int num_rows, int num_cols, bool axis, CUstream stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<kCUDABlockNumThreads><<<grid_size, kCUDABlockNumThreads, 0, stream>>>
(f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<kCUDABlockNumThreads><<<grid_size, tpb, 0, stream>>>
(f, g, input, output, num_rows, num_cols);
}
}
};
template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T *input, T *output, int rows, int cols, bool axis, CUstream stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
hipStreamSynchronize(stream);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(), input, output, rows, cols, axis, stream);
}