Commit ffdb193b authored by lishen

warpctc for dcu

parent 99e2985d
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. For a non-power-of-2 X the result is rounded up when roundUp is true, and down otherwise.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
} // namespace mgpu
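// Illustrative sketch (added, not part of the original moderngpu header): how a few of
// the compile-time helpers above evaluate, assuming they are used through the mgpu
// namespace. These would hold as static_asserts in C++11:
//   static_assert(mgpu::sIsPow2<8>::value == 1, "8 is a power of 2");
//   static_assert(mgpu::sDivUp<10, 3>::value == 4, "ceil(10 / 3) == 4");
//   static_assert(mgpu::sLogPow2<8>::value == 3, "log2(8) == 3");
//   static_assert(mgpu::sLogPow2<9, true>::value == 4, "rounds up for non-powers of 2");
//   static_assert(mgpu::sRoundUp<10, 4>::value == 12, "round 10 up to a multiple of 4");
//   static_assert(mgpu::sNumFactorsOf2<12>::value == 2, "12 = 2 * 2 * 3");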
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward-declare the stream type to avoid pulling in the runtime headers. In this
// DCU/HIP build, CUstream aliases a HIP stream rather than the CUDA type below.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
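/* Illustrative usage (added sketch, not part of the original header): initialize the
 * options struct as recommended above, then set only the fields you need.
 *
 *   // C++
 *   ctcOptions options{};
 *   options.loc = CTC_CPU;
 *   options.num_threads = 4;   // 0 lets the CPU path use omp_get_max_threads()
 *   options.blank_label = 0;
 *
 *   // C
 *   struct ctcOptions opts;
 *   memset(&opts, 0, sizeof(struct ctcOptions));
 *   opts.loc = CTC_GPU;
 *   opts.stream = stream;      // a caller-created HIP stream
 */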
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
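/* Illustrative call sequence (added sketch): pair get_workspace_size with
 * compute_ctc_loss. The buffers acts, grads, labels, label_lens, input_lens and
 * costs are hypothetical caller-owned arrays laid out as described above.
 *
 *   ctcOptions options{};
 *   options.loc = CTC_CPU;
 *   options.blank_label = 0;
 *   size_t workspace_bytes = 0;
 *   if (get_workspace_size(label_lens, input_lens, alphabet_size, minibatch,
 *                          options, &workspace_bytes) != CTC_STATUS_SUCCESS)
 *       return;
 *   void* workspace = malloc(workspace_bytes);   // for CTC_GPU use hipMalloc instead
 *   ctcStatus_t status = compute_ctc_loss(acts, grads, labels, label_lens, input_lens,
 *                                         alphabet_size, minibatch, costs, workspace,
 *                                         options);
 *   printf("ctc status: %s\n", ctcGetStatusString(status));
 *   free(workspace);
 */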
#ifdef __cplusplus
}
#endif
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template <typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size, int minibatch, void* workspace, int num_threads, int blank_label)
: alphabet_size_(alphabet_size), minibatch_(minibatch), num_threads_(num_threads), workspace_(workspace), blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(
const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size, void* workspace, size_t bytes_used, int blank_label, const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations, ProbT* probs, const int* const input_lengths);
std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad, const ProbT* const probs, const int* const labels, int T, int L, int mb, size_t bytes_used);
ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T, const int* const e_inc, const int* const s_inc, const int* const labels, ProbT* alphas);
ProbT compute_betas_and_grad(
ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template <typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(
int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template <typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels, int blank_label, int L, int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1; // get start
int repeats = 0; // number of repeated labels
for (int i = 1; i < L; ++i) {
if (labels[i - 1] == labels[i]) { // repeat label
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1; // label and blank
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
} else {
s_inc[s_counter++] = 2; // single label and no repeat
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1; // get end
// // printf("s_counter=%d, e_counter=%d, repeats=%d\n", s_counter, e_counter, repeats);
// for (int i = 0; i < S; ++i) {
// printf("s_inc[%d]=%d, e_inc[%d]=%d\n", i, s_inc[i], i, e_inc[i]);
// }
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label; // end is blank
return repeats;
}
template <typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs, const int* const input_lengths) {
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) { // iter batch
for (int c = 0; c < input_lengths[mb]; ++c) { // iter input audio vec
int col_offset = (mb + minibatch_ * c) * alphabet_size_; // vec index * alphabet_size_
//// get max_activation
ProbT max_activation = -std::numeric_limits<ProbT>::infinity(); // running max, start at -inf
for (int r = 0; r < alphabet_size_; ++r) // iter alphabet
max_activation = std::max(max_activation, activations[r + col_offset]);
//// compute probs between activations and max
ProbT denom = ProbT(0.);
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
//// scale probs
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
}
}
}
}
template <typename ProbT>
std::tuple<ProbT, bool> CpuCTC<
ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const probs, const int* const labels, int T, int L, int mb, size_t bytes_used) {
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);
bool over_threshold = false;
// check (length of labels + repeats) <= (length of utterance)
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas);
ProbT llBackward =
compute_betas_and_grad(grad, probs, llForward, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas, ctcm.betas, ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
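// Sketch of the recurrence implemented below (added comment), all in log space with
// log_plus acting as a stable log-sum-exp:
//   alpha_t(s) = log_plus(alpha_{t-1}(s), alpha_{t-1}(s-1) [, alpha_{t-1}(s-2)])
//                + log p_t(label[s])
// where the alpha_{t-1}(s-2) term is included only when label[s] is neither the
// blank nor a repeat of label[s-2].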
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(
const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
// get log probs of label
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
// printf("start=%d, end=%d, t=1~srcLen=%d, repeats=%d\n", start, end, T, repeats);
for (int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
// printf("t=%d, remain=%d\n", t, remain);
if (remain >= 0)
start += s_inc[remain];
if (t <= (S / 2) + repeats)
end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
// printf("00 alphas[%d]=%f, alphas[%d]=%f\n", t, alphas[idx1], t - 1, alphas[idx2]);
startloop += 1;
}
// printf("start=%d, startloop=%d, end=%d\n", start, startloop, end);
for (int i = startloop; i < end; ++i) {
// printf("alphas[(t - 1=%d, u=%d)]=%f\n", t - 1, i, alphas[i + idx2]);
// printf("alphas[(t - 1=%d, u-1=%d)]=%f\n", t - 1, i - 1, alphas[(i - 1) + idx2]);
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i - 1) + idx2]);
// printf("11 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) {
prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
// printf("22 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
}
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
// printf("33 alpha[%d,%d]=%f, log(p(%d))=%f, label(%d)=%d\n", t, i, alphas[i + idx1], labels[i], std::log(probs[labels[i] + idx3]), i, labels[i]);
}
// printf("\n");
}
// printf("final start=%d, end=%d\n", start, end);
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
// printf("compute alpha cost=%f\n", -loglike);
#ifdef DEBUG_KERNEL
printf("cpu alphas:\n");
printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
for (int t = start; t < end; ++t) {
printf("%.5f ", alphas[t + (T - 1) * S]);
}
printf("\n");
printf("alphas loglike=%f\n", loglike);
#endif
return loglike;
}
// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update product alpha * beta and then
// sum into the gradient associated with each label.
// NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
// Assumed passed in grads are already zeroed!
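// Sketch of the quantities computed below (added comment), all in log space. Note
// that, as stored here, both alpha_t(s) and beta_t(s) already include the emission
// term log p_t(label[s]), which is why log p_t(k) is subtracted again in the gradient:
//   beta_t(s) = log_plus(beta_{t+1}(s), beta_{t+1}(s+1) [, beta_{t+1}(s+2)])
//               + log p_t(label[s])
//   grad_t(k) = p_t(k) - exp( logsum_{s : label[s] = k} (alpha_t(s) + beta_t(s))
//                             - log p_t(k) - log_partition )
// where the beta_{t+1}(s+2) term is skipped on blanks and repeats, mirroring
// compute_alphas.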
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(
ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
// set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
// compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
// update the gradient associated with this label
// essentially performing a reduce-by-key in a sequential manner
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
}
// update the gradient wrt to each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() || probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
}
// loop from the second to last column all the way to the left
for (int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= -1)
start -= s_inc[remain + 1];
if (t < (S / 2) + repeats)
end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for (int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S - 2) && labels[i] != labels[i + 2]) {
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
// compute alpha * beta in log space
alphas[i + idx1] += betas[i];
// update the gradient associated with this label
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S - 1) + idx1] += betas[(S - 1)];
output[labels[S - 1]] = ctc_helper::log_plus<ProbT>()(alphas[S - 1 + idx1], output[labels[S - 1]]);
}
// go over the unique labels and compute the final grad
// wrt to each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() || probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
#ifdef DEBUG_KERNEL
printf("cpu betas:\n");
printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
for (int t = start; t < end; ++t) {
printf("%.5f ", betas[t]);
}
printf("\n");
printf("betas loglike=%f\n", loglike);
#endif
return loglike;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr || flat_labels == nullptr || label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
// get max length input audio vector
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
// memory to use
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
// get max length input text vector
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1; // labels with blanks
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_; // vector of alphabet
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT; // matrix size
// betas
per_minibatch_bytes += sizeof(float) * maxS; // a label sequence of length n expands to 2n+1 entries once blanks are inserted
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
// compute softmax probs
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) = cost_and_grad_kernel(
grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T,
L,
mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(
const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr || label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(
L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used + mb * per_minibatch_bytes,
blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_, ctcm.repeats, S, T, ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template<typename Arg, typename Res = Arg>
struct maximum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? y : x;
}
};
template<typename Arg, typename Res = Arg>
struct minimum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? x : y;
}
};
template<typename Arg, typename Res = Arg>
struct add {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x + y;
}
};
template<typename Arg, typename Res = Arg>
struct identity {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(x);
}
};
template<typename Arg, typename Res = Arg>
struct negate {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(-x);
}
};
template<typename Arg, typename Res = Arg>
struct exponential {
HOSTDEVICE Res operator()(const Arg &x) const { return std::exp(x); }
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1 &p1, const Arg2 &p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
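// Added note: log_plus is a numerically stable log-sum-exp, i.e.
// log_plus(log(a), log(b)) == log(a + b). For example, with a = b = 0.5,
//   ctc_helper::log_plus<float>()(std::log(0.5f), std::log(0.5f))
// evaluates to log1p(exp(0)) + log(0.5) = log(2) + log(0.5) = 0.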
//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
// HOSTDEVICE
// Res operator()(const Arg1& p1, const Arg2& p2) {
// Res p12_max = maximum<Res>()(p1, p2);
// Res p12_min = minimum<Res>()(p1, p2);
// Res p12_diff = p12_min-p12_max;
// Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
// Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
// : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//
// return result;
// }
//};
}
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
#include <stdio.h>
const int kCUDABlockNumThreads = 256;
template<typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void *workspace,
CUstream stream,
int blank_label) :
out_dim_(alphabet_size), minibatch_(minibatch),
gpu_workspace_(workspace), stream_(stream),
blank_label_(blank_label) {};
// Noncopyable
GpuCTC(const GpuCTC &) = delete;
GpuCTC &operator=(const GpuCTC &) = delete;
ctcStatus_t
cost_and_grad(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
ctcStatus_t
score_forward(const ProbT *const activations,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
private:
template<int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT *const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t
launch_gpu_kernels(const ProbT *const probs,
ProbT *grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t
setup_gpu_metadata(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths);
ctcStatus_t
create_metadata_and_choose_config(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
size_t &best_config);
ctcStatus_t
compute_probs(const ProbT *const activations);
ctcStatus_t
compute_cost_and_score(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void *gpu_workspace_; // Buffer for all temporary GPU memory
int *utt_length_; // T
int *label_sizes_; // L
int *repeats_; // repeats_
int *label_offsets_;
int *labels_without_blanks_;
int *labels_with_blanks_;
ProbT *alphas_;
ProbT *nll_forward_;
ProbT *nll_backward_;
ProbT *denoms_; // Temporary storage for denoms for softmax
ProbT *probs_; // Temporary storage for probabilities (softmax output)
};
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
size_t gpu_bytes_used = 0;
nll_forward_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
hipError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i - 1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
hipMemcpyHostToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ = reinterpret_cast<int *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ = reinterpret_cast<ProbT *>(static_cast<char *>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT *const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries away
const int stride = minibatch_;
if (compute_alpha) {
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_,
repeats_, labels_without_blanks_, label_offsets_,
labels_with_blanks_, alphas_, nll_forward_,
stride, out_dim_, S_, T_, blank_label_);
hipStreamSynchronize(stream_);
}
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_, repeats_,
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
grads, stride, out_dim_, S_, T_, blank_label_);
hipStreamSynchronize(stream_);
}
hipError_t err = hipGetLastError();
if (err != hipSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
size_t &best_config) {
// Setup the metadata for GPU
ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS)
return status;
constexpr int num_configs = 12;
int config_NT[num_configs] =
{32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] =
{1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i] * config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs)
return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT *const probs,
ProbT *grads,
size_t config,
bool l_a,
bool l_b) {
switch (config) {
case 0: {
return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);
}
case 1: {
return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);
}
case 2: {
return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);
}
case 3: {
return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);
}
case 4: {
return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);
}
case 5: {
return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);
}
case 6: {
return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);
}
case 7: {
return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);
}
case 8: {
return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);
}
case 9: {
return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);
}
case 10: {
return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);
}
case 11: {
return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);
}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_probs(const ProbT *const activations) {
hipError_t cuda_status;
cuda_status = hipMemcpyAsync(probs_, activations,
activation_cols_ * out_dim_ * sizeof(ProbT),
hipMemcpyDeviceToDevice, stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = hipStreamSynchronize(stream_);
if (cuda_status != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable SM
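// The steps below compute, for each activation column c:
//   probs[r, c] = exp(acts[r, c] - max_i acts[i, c]) / sum_i exp(acts[i, c] - max_i acts[i, c])
// reduce_max writes the per-column maxima into denoms_, prepare_stable_SM_kernel
// subtracts them from probs_, reduce_exp overwrites denoms_ with the per-column sums
// of exp(shifted values), and compute_probs_kernel exponentiates and divides through.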
ctcStatus_t ctc_status = reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to subtract maximum
const int NT = kCUDABlockNumThreads;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status = reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::exponential<ProbT>(), probs_, denoms_, out_dim_, num_elements);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
label_lengths,
input_lengths,
best_config);
if (status != CTC_STATUS_SUCCESS)
return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS)
return status;
launch_gpu_kernels(probs_, grads, best_config,
compute_alpha, compute_betas_and_grad);
hipError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = hipMemcpyAsync(costs, nll_forward_,
sizeof(ProbT) * minibatch_,
hipMemcpyDeviceToHost, stream_);
cuda_status_sync = hipStreamSynchronize(stream_);
if (cuda_status_mem != hipSuccess || cuda_status_sync != hipSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT *const activations,
ProbT *grads,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, grads, costs, flat_labels,
label_lengths, input_lengths, true, true);
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT *const activations,
ProbT *costs,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, nullptr, costs, flat_labels,
label_lengths, input_lengths, true, false);
}
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
#include <stdio.h>
using namespace mgpu;
template<int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum {
NV = NT * VT
};
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
//adapted from global kernel KernelReduceByKeyPreprocess
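// Added note: preprocessKeys expects the keys to be sorted; it writes the number of
// distinct keys to *numUniqueLabels, fills seg_start/seg_end with the [start, end]
// index range of each run of equal keys (scanout holds the last index of each run),
// and compacts the unique keys back into the keys array.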
__device__ static void preprocessKeys(KeyT *keys, int count,
int *numUniqueLabels, int seg_start[VT],
int seg_end[VT], int *scanout) {
__shared__
Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if (index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
//Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
//output the unique keys
//use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ((endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels); idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
//copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, so only a sparse
// set of values is read from the probs matrix, since the label sequence is much smaller
// than the character set. This is even more pronounced for Mandarin than for English.
template<typename ProbT, int NT, int VT>
__global__
void compute_alpha_kernel(const ProbT *probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_without_blanks, const int *label_offsets,
int *labels_with_blanks, ProbT *alphas,
ProbT *nll_forward, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T)
return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset + 1] = labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
// Set the first row of alpha neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
// Initialize the first row corresponding to t=0;
for (int i = tid; i < (end - start); i += blockDim.x) {
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel probs is %f\n", probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel alpha is %f\n", alpha[i + start]);
}
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for (int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
// The prob is a 2D column major array, with probabilites for each t strided
// by (out_dim * stride), where stride is the minibatch size, out_dim is alphabet_size
const int start_prob_col = t * (out_dim * stride);
// This is the first column and in this case there is nothing left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] = alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
} else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
#pragma unroll
for (int idx = (tid + 1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], alpha[(idx - 1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != 1) && (label[idx] != label[idx - 2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
// Add and return the rightmost two/one element(s) in the last row.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
// This is the total increment for s_inc and e_inc through the loop
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L != 0) + start);
end = (val * (L != 0) + end);
for (int i = start; i < end; ++i) {
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
}
nll_forward[blockIdx.x] = -loglike;
}
}
// Computes backward probabilities. This also fills in a T * S matrix
//
// See comments above compute_alphas for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel(const ProbT *probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_with_blanks, ProbT *alphas,
const ProbT *nll_forward, ProbT *nll_backward,
ProbT *grads, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__
TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__
ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T)
return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S - 1;
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
// Sort labels and record indices from which to gather from
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int> >
(key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(keys_shared, S, &uniquelabels, seg_start, seg_end,
temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for (int i = tid; i < (end - start); i += blockDim.x)
temp_buffer.beta[i + start] =
log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
// Load output data in registers through the transpose trick - should really be a function
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
// Start at the second to last row and backward in time
for (int t = T - 1; t >= 0; --t) {
// Start offsets into the current and next row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T - 1) {
// Filling up one row at a time but going back in time from the last row
// to the first. As in the forward pass, there is no loop dependence and we
// do a variable length filter of maximum filter size of 3
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
ProbT next_sum = log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != (S - 2)) && (label[idx] != label[idx + 2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);
beta_val[i] = next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
// Initialize values for the rightmost column since there is nothing to the right
// Update input buffer for next iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S - 1)] = temp_buffer.beta[(S - 1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
// Beta Computation done - add to alpha and update the gradient. Reload
// the gradient back for segmented reduce later on
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
// Write accumulated value into output since that is not used
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
if ((grad == 0.0) || (probs[grads_offset] == 0.0) ||
(grad == ctc_helper::neg_inf<ProbT>())) {
} else {
grads[grads_offset] =
probs[grads_offset] - exp(grad - log(probs[grads_offset]) - log_partition);
}
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
// For some reason this is important
__syncthreads();
}
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT *probs,
const ProbT *const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT *probs,
const ProbT *const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
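// How these two kernels fit together (a sketch of the intended math, not the exact
// driver code, which lives in the GpuCTC class): for each time-step column c of the
// activations x,
//
//   prepare_stable_SM_kernel:  y[r, c]     = f(x[r, c] - col_max[c])
//   compute_probs_kernel:      probs[r, c] = f(y[r, c]) / denom[c]
//
// With col_max obtained from a column-wise max reduction and denom a column-wise sum
// of exponentials, the two passes give a numerically stable softmax
//   probs[r, c] = exp(x[r, c] - max_r x[r, c]) / sum_r exp(x[r, c] - max_r x[r, c]).
// Suitable functors (ctc_helper::exponential / ctc_helper::identity) are defined in
// detail/ctc_helper.h. A hypothetical launch (grid/block sizes are illustrative only):
//   compute_probs_kernel<float, 1><<<(count + 127) / 128, 128, 0, stream>>>(
//       ctc_helper::identity<float>(), probs, denoms, alphabet_size, count);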
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
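// Log-space/linear reductions over the (rows x cols) activation matrix, used to build
// the softmax maxima and denominators. Following the ReduceHelper implementation in
// reduce.cu: axis == true reduces down the rows of each column (one output per column),
// axis == false reduces across the columns of each row (one output per row).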
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
import torch
from typing import List, Optional, Union
import glob
import os
import shlex
import subprocess
import sys
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from setuptools import find_packages, setup
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging # type: ignore[attr-defined]
def _find_rocm_home() -> Optional[str]:
rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH')
if rocm_home is None:
try:
pipe_hipcc = subprocess.Popen(
["which hipcc | xargs readlink -f"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
hipcc, _ = pipe_hipcc.communicate()
rocm_home = os.path.dirname(os.path.dirname(hipcc.decode().rstrip('\r\n')))
if os.path.basename(rocm_home) == 'hip':
rocm_home = os.path.dirname(rocm_home)
except Exception:
rocm_home = '/opt/rocm'
if not os.path.exists(rocm_home):
rocm_home = None
if rocm_home and torch.version.hip is None:
print(f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'")
return rocm_home
def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
if cflags is not None:
for flag in cflags:
if 'amdgpu-target' in flag:
return ['-fno-gpu-rdc']
archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906')
flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')]
flags += ['-fno-gpu-rdc']
return flags
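# Minimal usage sketch (the architecture value is an assumption for gfx906-class DCU hardware):
#   PYTORCH_ROCM_ARCH="gfx906" python3 setup.py install
#   -> _get_rocm_arch_flags() returns ['--amdgpu-target=gfx906', '-fno-gpu-rdc']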
ROCM_HOME = _find_rocm_home()
IS_HIP_EXTENSION = True if ((ROCM_HOME is not None) and (torch.version.hip is not None)) else False
COMMON_HIP_FLAGS = [
'-fPIC',
'-D__HIP_PLATFORM_HCC__=1',
]
COMMON_HIPCC_FLAGS = [
'-DCUDA_HAS_FP16=1',
'-D__HIP_NO_HALF_OPERATORS__=1',
'-D__HIP_NO_HALF_CONVERSIONS__=1',
]
def is_ninja_available():
try:
subprocess.check_output('ninja --version'.split())
except Exception:
return False
else:
return True
def verify_ninja_availability():
if not is_ninja_available():
raise RuntimeError("Ninja is required to load C++ extensions")
def _is_cuda_file(path: str) -> bool:
valid_ext = ['.cu', '.cuh']
if IS_HIP_EXTENSION:
valid_ext.append('.hip')
return os.path.splitext(path)[1] in valid_ext
def _join_rocm_home(*paths) -> str:
if ROCM_HOME is None:
raise EnvironmentError('ROCM_HOME environment variable is not set. ')
return os.path.join(ROCM_HOME, *paths)
def _write_ninja_file(path, cflags, post_cflags, cuda_cflags, cuda_post_cflags, sources,
objects, ldflags, library_target, with_cuda) -> None:
def sanitize_flags(flags):
if flags is None:
return []
else:
return [flag.strip() for flag in flags]
cflags = sanitize_flags(cflags)
post_cflags = sanitize_flags(post_cflags)
cuda_cflags = sanitize_flags(cuda_cflags)
cuda_post_cflags = sanitize_flags(cuda_post_cflags)
ldflags = sanitize_flags(ldflags)
assert len(sources) == len(objects)
assert len(sources) > 0
compiler = os.environ.get('CXX', 'c++')
config = ['ninja_required_version = 1.3']
config.append(f'cxx = {compiler}')
if with_cuda:
if IS_HIP_EXTENSION:
nvcc = _join_rocm_home('bin', 'hipcc')
config.append(f'nvcc = {nvcc}')
flags = [f'cflags = {" ".join(cflags)}']
flags.append(f'post_cflags = {" ".join(post_cflags)}')
if with_cuda:
flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}')
flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}')
flags.append(f'ldflags = {" ".join(ldflags)}')
sources = [os.path.abspath(file) for file in sources]
compile_rule = ['rule compile']
compile_rule.append(' command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags')
compile_rule.append(' depfile = $out.d')
compile_rule.append(' deps = gcc')
if with_cuda:
cuda_compile_rule = ['rule cuda_compile']
nvcc_gendeps = ''
required_cuda_version = packaging.version.parse('10.2')
has_cuda_version = torch.version.cuda is not None
if has_cuda_version and packaging.version.parse(torch.version.cuda) >= required_cuda_version:
cuda_compile_rule.append(' depfile = $out.d')
cuda_compile_rule.append(' deps = gcc')
cuda_compile_rule.append(
f' command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags')
build = []
for source_file, object_file in zip(sources, objects):
is_cuda_source = _is_cuda_file(source_file) and with_cuda
rule = 'cuda_compile' if is_cuda_source else 'compile'
source_file = source_file.replace(" ", "$ ")
object_file = object_file.replace(" ", "$ ")
build.append(f'build {object_file}: {rule} {source_file}')
if library_target is not None:
link_rule = ['rule link']
link_rule.append(' command = $cxx $in $ldflags -o $out')
link = [f'build {library_target}: link {" ".join(objects)}']
default = [f'default {library_target}']
else:
link_rule, link, default = [], [], []
blocks = [config, flags, compile_rule]
if with_cuda:
blocks.append(cuda_compile_rule)
blocks += [link_rule, build, link, default]
with open(path, 'w') as build_file:
for block in blocks:
lines = '\n'.join(block)
build_file.write(f'{lines}\n\n')
def _get_num_workers(verbose: bool) -> Optional[int]:
max_jobs = os.environ.get('MAX_JOBS')
if max_jobs is not None and max_jobs.isdigit():
if verbose:
print(f'Using envvar MAX_JOBS ({max_jobs}) as the number of workers...')
return int(max_jobs)
if verbose:
print('Allowing ninja to set a default number of workers... ')
return None
def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> None:
command = ['ninja', '-v']
num_workers = _get_num_workers(verbose)
if num_workers is not None:
command.extend(['-j', str(num_workers)])
env = os.environ.copy()
try:
sys.stdout.flush()
sys.stderr.flush()
stdout_fileno = 1
subprocess.run(command, stdout=stdout_fileno if verbose else subprocess.PIPE, stderr=subprocess.STDOUT,
cwd=build_directory, check=True, env=env)
except subprocess.CalledProcessError as e:
_, error, _ = sys.exc_info()
message = error_prefix
if hasattr(error, 'output') and error.output: # type: ignore[union-attr]
message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}" # type: ignore[union-attr]
raise RuntimeError(message) from e
def _write_ninja_file_and_compile_objects(sources: List[str], objects, cflags, post_cflags, cuda_cflags,
cuda_post_cflags, build_directory: str, verbose: bool,
with_cuda: Optional[bool]) -> None:
verify_ninja_availability()
compiler = os.environ.get('CXX', 'c++')
if with_cuda is None:
with_cuda = any(map(_is_cuda_file, sources))
build_file_path = os.path.join(build_directory, 'build.ninja')
if verbose:
print(f'Emitting ninja build file {build_file_path}...')
_write_ninja_file(path=build_file_path, cflags=cflags, post_cflags=post_cflags, cuda_cflags=cuda_cflags,
cuda_post_cflags=cuda_post_cflags, sources=sources, objects=objects, ldflags=None,
library_target=None, with_cuda=with_cuda)
if verbose:
print('Compiling objects...')
_run_ninja_build(
build_directory,
verbose,
error_prefix='Error compiling objects for extension')
class BuildReleaseExtension(BuildExtension):
def __init__(self, *args, **kwargs) -> None:
super(BuildReleaseExtension, self).__init__(*args, **kwargs)
def build_extensions(self) -> None:
self._check_abi()
cuda_ext = False
extension_iter = iter(self.extensions)
extension = next(extension_iter, None)
while not cuda_ext and extension:
for source in extension.sources:
_, ext = os.path.splitext(source)
if ext == '.cu':
cuda_ext = True
break
extension = next(extension_iter, None)
for extension in self.extensions:
if isinstance(extension.extra_compile_args, dict):
for ext in ['cxx', 'nvcc']:
if ext not in extension.extra_compile_args:
extension.extra_compile_args[ext] = []
self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H')
for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
val = getattr(torch._C, f"_PYBIND11_{name}")
self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"')
self._define_torch_extension_name(extension)
self._add_gnu_cpp_abi_flag(extension)
self.compiler.src_extensions += ['.cu', '.cuh', '.hip']
def append_std14_if_no_std_present(cflags) -> None:
cpp_format_prefix = '/{}:' if self.compiler.compiler_type == 'msvc' else '-{}='
cpp_flag_prefix = cpp_format_prefix.format('std')
cpp_flag = cpp_flag_prefix + 'c++14'
if not any(flag.startswith(cpp_flag_prefix) for flag in cflags):
cflags.append(cpp_flag)
def convert_to_absolute_paths_inplace(paths):
if paths is not None:
for i in range(len(paths)):
if not os.path.isabs(paths[i]):
paths[i] = os.path.abspath(paths[i])
def unix_wrap_ninja_compile(sources, output_dir=None, macros=None, include_dirs=None, debug=0,
extra_preargs=None, extra_postargs=None, depends=None):
output_dir = os.path.abspath(output_dir)
convert_to_absolute_paths_inplace(self.compiler.include_dirs)
_, objects, extra_postargs, pp_opts, _ = \
self.compiler._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
common_cflags = self.compiler._get_cc_args(pp_opts, debug, extra_preargs)
extra_cc_cflags = self.compiler.compiler_so[1:]
if (debug):
print("debug mode")
else:
# Strip debug-only flags if present; list.remove() would raise if a flag were missing.
extra_cc_cflags = [f for f in extra_cc_cflags if f not in ('-g', '-Wall')]
print("release mode")
with_cuda = any(map(_is_cuda_file, sources))
if isinstance(extra_postargs, dict):
post_cflags = extra_postargs['cxx']
else:
post_cflags = list(extra_postargs)
if IS_HIP_EXTENSION:
post_cflags = COMMON_HIP_FLAGS + post_cflags
append_std14_if_no_std_present(post_cflags)
cuda_post_cflags = None
cuda_cflags = None
if with_cuda:
cuda_cflags = common_cflags
if isinstance(extra_postargs, dict):
cuda_post_cflags = extra_postargs['nvcc']
else:
cuda_post_cflags = list(extra_postargs)
if IS_HIP_EXTENSION:
cuda_post_cflags = cuda_post_cflags + _get_rocm_arch_flags(cuda_post_cflags)
cuda_post_cflags = COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags
append_std14_if_no_std_present(cuda_post_cflags)
cuda_cflags = [shlex.quote(f) for f in cuda_cflags]
cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags]
_write_ninja_file_and_compile_objects(sources=sources, objects=objects,
cflags=[shlex.quote(f) for f in extra_cc_cflags + common_cflags],
post_cflags=[shlex.quote(f) for f in post_cflags],
cuda_cflags=cuda_cflags,
cuda_post_cflags=cuda_post_cflags, build_directory=output_dir,
verbose=True, with_cuda=with_cuda)
return objects
self.compiler.compile = unix_wrap_ninja_compile
build_ext.build_extensions(self)
def get_version():
return "0.1"
def get_extensions():
extensions = []
include_dirs = []
define_macros = []
extra_compile_args = {'cxx': ['-O3'], 'nvcc': []}
args = []
args += ['-DWARPCTC_ENABLE_GPU']
args += ['-DCTC_DISABLE_OMP']
# args += ['-DDEBUG_KERNEL']
args += ['-Wno-deprecated']
extra_compile_args['cxx'] += args
extra_compile_args['nvcc'] += args
op_files = glob.glob('./src/*.cu') + glob.glob('./src/*.cpp') + ['../src/reduce.cu', '../src/ctc_entrypoint.cu']
print('op_files = ', op_files)
extension = CUDAExtension
include_dirs.append(os.path.realpath('../include/'))
include_dirs.append('/opt/dtk/rocrand/include/')
include_dirs.append('/opt/dtk/hiprand/include/')
print('include_dirs = ', include_dirs)
ext_ops = extension(
name="_warp_ctc",
sources=op_files,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args)
extensions.append(ext_ops)
return extensions
def main():
setup(
name='warpctc_pytorch',
version=get_version(),
description='PyTorch bindings for warp-ctc (CTC loss) on ROCm/DCU',
keywords='ctc loss',
packages=find_packages(),
include_package_data=False,
package_data={
'warpctc_pytorch': [
"src/*.cuh",
"src/*.cu",
"src/*.hip",
"src/*.cpp"
]
},
ext_modules=get_extensions(),
cmdclass={
'build_ext': BuildReleaseExtension
},
zip_safe=False
)
if __name__ == "__main__":
main()
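# Typical build/install invocation (a hedged example; the exact DCU toolchain environment may differ):
#   ROCM_HOME=/opt/dtk MAX_JOBS=8 python3 setup.py install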
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include <THC/THCGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::cuda::getCurrentCUDAStream();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
// !!! This is a file automatically generated by hipify!!!
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/hip/HIPContext.h"
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "ATen/hip/HIPEvent.h"
#include <THH/THHGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
import torch
import warpctc_pytorch as warp_ctc
def test_empty_label(test_cpu=True, test_gpu=True):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous()
grads = torch.zeros(probs.size())
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs.size(1)
if test_cpu:
costs = torch.zeros(minibatch_size)
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if test_gpu:
probs = probs.clone().cuda()
grads = torch.zeros(probs.size()).cuda()
costs = torch.zeros(minibatch_size)
warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('GPU_cost: %f' % costs.sum())
print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if __name__ == '__main__':
print('torch.cuda.is_available() ', torch.cuda.is_available())
# test_empty_label(test_cpu=True, test_gpu=False)
test_empty_label(test_cpu=False, test_gpu=True)
# HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time
def test_compare_cpu(repeat_num=20):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs.size(1)
costs = torch.zeros(minibatch_size)
grads = torch.zeros(probs.size())
time_st = time.perf_counter()
# 1. Run the old CPU version
for i in range(repeat_num):
probs_old = probs.clone()
costs_old = costs.clone()
grads_old = grads.clone()
warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes, sizes, minibatch_size, costs_old, 0)
if i == 0:
print('CPU_costs_old: %f' % costs_old.sum())
print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(probs_old, grads_old, costs_old))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc old version using time: ', time_used)
time_st = time.perf_counter()
# 2. Run the new CPU version
for i in range(repeat_num):
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc new version using time: ', time_used)
def test_compare_gpu():
probs0 = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
]).contiguous().cuda()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2, 0])
sizes = torch.IntTensor([2, 2])
minibatch_size = probs0.size(1)
# 1. Run the new CPU version
probs_new = probs0.clone().cuda()
costs_new = torch.zeros(minibatch_size)
grads_new = torch.zeros(probs0.size())
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
# 2. Run the old CPU version
probs = probs0.clone().cuda()
costs = torch.zeros(minibatch_size)
grads = torch.zeros(probs0.size())
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
if __name__ == '__main__':
print('torch.cuda.is_available() ', torch.cuda.is_available())
test_compare_cpu()
test_compare_gpu()
import torch
import warpctc_pytorch as warp_ctc
from torch.autograd import Function
from torch.nn import Module
from _warp_ctc import * # noqa
def _assert_no_grad(tensor):
assert not tensor.requires_grad, \
"gradients only computed for acts - please " \
"mark other tensors as not requiring gradients"
class _CTC(Function):
@staticmethod
def forward(ctx, acts, labels, act_lens, label_lens, size_average=False,
length_average=False, blank=0):
is_cuda = True if acts.is_cuda else False
# print('_CTC is_cuda', is_cuda)
acts = acts.contiguous()
loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc
grads = torch.zeros(acts.size()).type_as(acts)
minibatch_size = acts.size(1)
costs = torch.zeros(minibatch_size).cpu()
loss_func(acts,
grads,
labels,
label_lens,
act_lens,
minibatch_size,
costs,
blank)
costs = torch.FloatTensor([costs.sum()])
if length_average:
# Compute the avg. log-probability per batch sample and frame.
total_length = torch.sum(act_lens).item()
grads = grads / total_length
costs = costs / total_length
elif size_average:
# Compute the avg. log-probability per batch sample.
grads = grads / minibatch_size
costs = costs / minibatch_size
ctx.grads = grads
return costs
@staticmethod
def backward(ctx, grad_output):
_grad_output = grad_output.to(ctx.grads.device)
return ctx.grads.mul_(_grad_output), None, None, None, None, None, None
class CTCLoss(Module):
"""
Parameters:
size_average (bool): normalize the loss by the batch size
(default: `False`)
length_average (bool): normalize the loss by the total number of frames
in the batch. If `True`, supersedes `size_average`
(default: `False`)
"""
def __init__(self, blank=0, size_average=False, length_average=False):
super(CTCLoss, self).__init__()
self.ctc = _CTC.apply
self.blank = blank
self.size_average = size_average
self.length_average = length_average
def forward(self, acts, labels, act_lens, label_lens):
"""
acts: Tensor of (seqLength x batch x outputDim) containing output from network
labels: 1 dimensional Tensor containing all the targets of the batch in one sequence
act_lens: Tensor of size (batch) containing size of each output sequence from the network
label_lens: Tensor of (batch) containing label length of each example
"""
# labels must be 1 dimensional
if len(labels.size()) != 1:
print('error!! len(labels.size()) must be 1, get {}'.format(len(labels.size())))
raise ValueError
_assert_no_grad(labels)
_assert_no_grad(act_lens)
_assert_no_grad(label_lens)
return self.ctc(acts, labels, act_lens, label_lens, self.size_average,
self.length_average, self.blank)
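# A minimal usage sketch (illustrative values; acts are raw network activations of
# shape seqLength x batch x outputDim, as described in forward() above):
#
#   ctc_loss = CTCLoss(blank=0)
#   acts = torch.randn(15, 2, 5).requires_grad_()   # 15 frames, batch of 2, 5 symbols
#   labels = torch.IntTensor([1, 2, 2, 3])          # all targets concatenated, 1-D
#   label_lens = torch.IntTensor([2, 2])            # targets per utterance
#   act_lens = torch.IntTensor([15, 15])            # frames per utterance
#   cost = ctc_loss(acts, labels, act_lens, label_lens)
#   cost.backward()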
#include <cstddef>
#include <iostream>
#include <algorithm>
#include "ctc.h"
#include "detail/cpu_ctc.h"
#ifdef __HIPCC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() {
return 13;
}
const char *ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float *const activations,
float *gradients,
const int *const flat_labels,
const int *const label_lengths,
const int *const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options) {
if (activations == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr ||
costs == nullptr ||
workspace == nullptr ||
alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations, gradients,
costs,
flat_labels, label_lengths,
input_lengths);
else
return ctc.score_forward(activations,
costs, flat_labels,
label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __HIPCC__
GpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations, gradients, costs,
flat_labels, label_lengths,
input_lengths);
else
return ctc.score_forward(activations, costs, flat_labels,
label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int *const label_lengths,
const int *const input_lengths,
int alphabet_size, int minibatch,
ctcOptions options,
size_t *size_bytes) {
if (label_lengths == nullptr ||
input_lengths == nullptr ||
size_bytes == nullptr ||
alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
// This is the max of all S and T for all examples in the minibatch.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
//nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
//repeats
*size_bytes += sizeof(int) * minibatch;
//label offsets
*size_bytes += sizeof(int) * minibatch;
//utt_length
*size_bytes += sizeof(int) * minibatch;
//label lengths
*size_bytes += sizeof(int) * minibatch;
//labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
//labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
//alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
//denoms
*size_bytes += sizeof(float) * maxT * minibatch;
//probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
//cpu can eventually replace all minibatch with
//max number of concurrent threads if memory is
//really tight
//per minibatch memory
size_t per_minibatch_bytes = 0;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size;
//alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
//betas
per_minibatch_bytes += sizeof(float) * S;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
//probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
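// Typical call sequence for this C API (mirrors the cpu_ctc binding above; the plain
// malloc shown here is illustrative, a GPU allocation is needed for CTC_GPU):
//
//   size_t bytes;
//   get_workspace_size(label_lengths, input_lengths, alphabet_size, minibatch,
//                      options, &bytes);
//   void* workspace = malloc(bytes);
//   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
//                    input_lengths, alphabet_size, minibatch, costs,
//                    workspace, options);
//   free(workspace);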
// Includes, system
#include <stdio.h>
#include <stdlib.h>
// Includes, cuda
#include <cuda_runtime.h>
//#include<cublas_v2.h>
#include <cuda_runtime_api.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"
const int warp_size = 64;
const int kCUDABlockNumThreads = 256;
template<int NT, typename T, typename Rop>
struct CTAReduce;
template<int NT, typename T, typename Rop>
struct CTAReduce {
enum {
Size = NT, Capacity = NT
};
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage &storage, int count, Rop g) {
T *s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
// shuff = __shfl_down(0xFFFFFFF, x, offset);
shuff = __shfl_down(x, offset);
if (tid + offset < count && tid < offset) {
x = g(x, shuff);
}
}
return x;
}
};
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) {
curr = f(input[idx + col * num_rows]);
}
// __syncthreads();
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) {
output[col] = curr;
}
}
template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T *input, T *output,
int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template<typename T, typename Iof, typename Rof>
static void impl(Iof f, Rof g, const T *input, T *output, int num_rows, int num_cols, bool axis, CUstream stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<kCUDABlockNumThreads><<<grid_size, kCUDABlockNumThreads, 0, stream>>>
(f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<kCUDABlockNumThreads><<<grid_size, tpb, 0, stream>>>
(f, g, input, output, num_rows, num_cols);
}
}
};
template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T *input, T *output, int rows, int cols, bool axis, CUstream stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
hipStreamSynchronize(stream);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
}
ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, CUstream stream) {
return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(), input, output, rows, cols, axis, stream);
}