#pragma once

#include <tuple>
#include <cmath>
#include <limits>
#include <algorithm>
#include <numeric>

#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif

#include "ctc_helper.h"

template <typename ProbT>
class CpuCTC {
public:
    // Noncopyable
    CpuCTC(int alphabet_size, int minibatch, void* workspace, int num_threads,
           int blank_label)
        : alphabet_size_(alphabet_size),
          minibatch_(minibatch),
          num_threads_(num_threads),
          workspace_(workspace),
          blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
        if (num_threads > 0) {
            omp_set_num_threads(num_threads);
        } else {
            num_threads_ = omp_get_max_threads();
        }
#endif
    };

    CpuCTC(const CpuCTC&) = delete;
    CpuCTC& operator=(const CpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:
    class CpuCTC_metadata {
    private:
        int setup_labels(const int* const labels, int blank_label, int L, int S);

    public:
        CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
                        void* workspace, size_t bytes_used, int blank_label,
                        const int* const labels);

        ProbT* alphas;
        ProbT* betas;
        int* labels_w_blanks;
        int* e_inc;
        int* s_inc;
        ProbT* output;
        int repeats;
    };

    int alphabet_size_; // Number of characters plus blank
    int minibatch_;
    int num_threads_;
    int blank_label_;
    void* workspace_;

    void softmax(const ProbT* const activations, ProbT* probs,
                 const int* const input_lengths);

    std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
                                                 const ProbT* const probs,
                                                 const int* const labels,
                                                 int T, int L, int mb,
                                                 size_t bytes_used);

    ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T,
                         const int* const e_inc,
                         const int* const s_inc,
                         const int* const labels,
                         ProbT* alphas);

    ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                 ProbT log_partition, int repeats,
                                 int S, int T,
                                 const int* const e_inc,
                                 const int* const s_inc,
                                 const int* const labels,
                                 ProbT* alphas,
                                 ProbT* betas,
                                 ProbT* output);
};

template <typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
                                                int alphabet_size,
                                                void* workspace,
                                                size_t bytes_used,
                                                int blank_label,
                                                const int* const labels) {
    alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S * T;
    std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());

    betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S;
    std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());

    labels_w_blanks = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * alphabet_size;

    repeats = setup_labels(labels, blank_label, L, S);
}

template <typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
                                                 int blank_label, int L, int S) {
    int e_counter = 0;
    int s_counter = 0;

    s_inc[s_counter++] = 1; // start

    int repeats = 0; // number of repeated labels

    for (int i = 1; i < L; ++i) {
        if (labels[i - 1] == labels[i]) { // repeated label
            s_inc[s_counter++] = 1;
            s_inc[s_counter++] = 1; // label and blank
            e_inc[e_counter++] = 1;
            e_inc[e_counter++] = 1;
            ++repeats;
        } else {
            s_inc[s_counter++] = 2; // single label, no repeat
            e_inc[e_counter++] = 2;
        }
    }
    e_inc[e_counter++] = 1; // end

    // // printf("s_counter=%d, e_counter=%d, repeats=%d\n", s_counter, e_counter, repeats);
    // for (int i = 0; i < S; ++i) {
    //     printf("s_inc[%d]=%d, e_inc[%d]=%d\n", i, s_inc[i], i, e_inc[i]);
    // }

    for (int i = 0; i < L; ++i) {
        labels_w_blanks[2 * i] = blank_label;
        labels_w_blanks[2 * i + 1] = labels[i];
    }
    labels_w_blanks[S - 1] = blank_label; // the expanded sequence ends with a blank

    return repeats;
}

template <typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
                            const int* const input_lengths) {
#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {         // iterate over the minibatch
        for (int c = 0; c < input_lengths[mb]; ++c) { // iterate over input (time) frames
            int col_offset = (mb + minibatch_ * c) * alphabet_size_; // column index * alphabet_size_

            // find the max activation in this column (for numerical stability)
            ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
            for (int r = 0; r < alphabet_size_; ++r) // iterate over the alphabet
                max_activation = std::max(max_activation, activations[r + col_offset]);

            // exponentiate the shifted activations and accumulate the denominator
            ProbT denom = ProbT(0.);
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
                denom += probs[r + col_offset];
            }

            // normalize into probabilities
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] /= denom;
            }
        }
    }
}

template <typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const probs,
                                    const int* const labels,
                                    int T, int L, int mb, size_t bytes_used) {
    const int S = 2 * L + 1; // Number of labels with blanks

    CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used,
                         blank_label_, labels);

    bool over_threshold = false;

    // check (length of labels + repeats) <= (length of utterance)
    if (L + ctcm.repeats > T) {
        return std::make_tuple(ProbT(0), over_threshold); // TODO: returning 0 here is not right
    }

    ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T, ctcm.e_inc,
                                     ctcm.s_inc, ctcm.labels_w_blanks, ctcm.alphas);

    ProbT llBackward = compute_betas_and_grad(grad, probs, llForward, ctcm.repeats,
                                              S, T, ctcm.e_inc, ctcm.s_inc,
                                              ctcm.labels_w_blanks, ctcm.alphas,
                                              ctcm.betas, ctcm.output);

    ProbT diff = std::abs(llForward - llBackward);
    if (diff > ctc_helper::threshold) {
        over_threshold = true;
    }

    return std::make_tuple(-llForward, over_threshold);
}

// Computes forward probabilities
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
                                    const int* const e_inc,
                                    const int* const s_inc,
                                    const int* const labels,
                                    ProbT* alphas) {
    int start = (((S / 2) + repeats - T) < 0) ? 0 : 1,
        end = S > 1 ? 2 : 1;
    // seed the first column with the log probs of the initial labels
    for (int i = start; i < end; ++i) {
        alphas[i] = std::log(probs[labels[i]]);
    }

    // printf("start=%d, end=%d, t=1~srcLen=%d, repeats=%d\n", start, end, T, repeats);
    for (int t = 1; t < T; ++t) {
        int remain = (S / 2) + repeats - (T - t);
        // printf("t=%d, remain=%d\n", t, remain);
        if (remain >= 0) start += s_inc[remain];
        if (t <= (S / 2) + repeats) end += e_inc[t - 1];
        int startloop = start;
        int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

        if (start == 0) {
            alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
            // printf("00 alphas[%d]=%f, alphas[%d]=%f\n", t, alphas[idx1], t - 1, alphas[idx2]);
            startloop += 1;
        }

        // printf("start=%d, startloop=%d, end=%d\n", start, startloop, end);
        for (int i = startloop; i < end; ++i) {
            // printf("alphas[(t - 1=%d, u=%d)]=%f\n", t - 1, i, alphas[i + idx2]);
            // printf("alphas[(t - 1=%d, u-1=%d)]=%f\n", t - 1, i - 1, alphas[(i - 1) + idx2]);
            ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i - 1) + idx2]);
            // printf("11 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);

            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) {
                prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
                // printf("22 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
            }

            alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
            // printf("33 alpha[%d,%d]=%f, log(p(%d))=%f, label(%d)=%d\n", t, i, alphas[i + idx1],
            //        labels[i], std::log(probs[labels[i] + idx3]), i, labels[i]);
        }
        // printf("\n");
    }

    // printf("final start=%d, end=%d\n", start, end);
    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
    }
    // printf("compute alpha cost=%f\n", -loglike);

#ifdef DEBUG_KERNEL
    printf("cpu alphas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", alphas[t + (T - 1) * S]);
    }
    printf("\n");
    printf("alphas loglike=%f\n", loglike);
#endif

    return loglike;
}

// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update the product alpha * beta and
// then sum into the gradient associated with each label.
// NOTE: computes the gradient w.r.t. the UNNORMALIZED final layer activations.
// Assumes the passed-in grads are already zeroed!
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                            ProbT log_partition, int repeats,
                                            int S, int T,
                                            const int* const e_inc,
                                            const int* const s_inc,
                                            const int* const labels,
                                            ProbT* alphas,
                                            ProbT* betas,
                                            ProbT* output) {
    int start = S > 1 ? (S - 2) : 0,
        end = (T > (S / 2) + repeats) ? S : S - 1;
    std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

    // set the starting values in the beta column at the very right edge
    for (int i = start; i < end; ++i) {
        betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);

        // compute alpha * beta in log space at this position in (S, T) space
        alphas[i + (T - 1) * S] += betas[i];

        // update the gradient associated with this label;
        // essentially performing a reduce-by-key in a sequential manner
        output[labels[i]] =
            ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
    }

    // update the gradient wrt each unique label
    for (int i = 0; i < alphabet_size_; ++i) {
        int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

        if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
            probs[idx3] == 0.0) {
            grad[idx3] = probs[idx3];
        } else {
            grad[idx3] = probs[idx3] -
                         std::exp(output[i] - std::log(probs[idx3]) - log_partition);
        }
    }

    // loop from the second-to-last column all the way to the left
    for (int t = T - 2; t >= 0; --t) {
        int remain = (S / 2) + repeats - (T - t);
        if (remain >= -1) start -= s_inc[remain + 1];
        if (t < (S / 2) + repeats) end -= e_inc[t];

        int endloop = end == S ? end - 1 : end;
        int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

        std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

        for (int i = start; i < endloop; ++i) {
            ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != (S - 2) && labels[i] != labels[i + 2]) {
                next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
            }
            betas[i] = next_sum + std::log(probs[labels[i] + idx3]);

            // compute alpha * beta in log space
            alphas[i + idx1] += betas[i];

            // update the gradient associated with this label
            output[labels[i]] =
                ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
        }

        if (end == S) {
            betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
            alphas[(S - 1) + idx1] += betas[(S - 1)];

            output[labels[S - 1]] =
                ctc_helper::log_plus<ProbT>()(alphas[S - 1 + idx1], output[labels[S - 1]]);
        }

        // go over the unique labels and compute the final grad
        // wrt each one at this time step
        for (int i = 0; i < alphabet_size_; ++i) {
            if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
                probs[idx3] == 0.0) {
                grad[idx3] = probs[idx3];
            } else {
                grad[idx3] = probs[idx3] -
                             std::exp(output[i] - std::log(probs[idx3]) - log_partition);
            }
            ++idx3;
        }
    }

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
    }

#ifdef DEBUG_KERNEL
    printf("cpu betas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", betas[t]);
    }
    printf("\n");
    printf("betas loglike=%f\n", loglike);
#endif

    return loglike;
}

template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                                         ProbT* grads,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    // get the max length of the input (audio) sequences
    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    // memory already used by the softmax probabilities
    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per-minibatch scratch memory
    size_t per_minibatch_bytes = 0;

    // get the max length of the label (text) sequences
    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1; // labels with blanks

    // output (one entry per alphabet symbol)
    per_minibatch_bytes += sizeof(float) * alphabet_size_;

    // alphas (S x T matrix)
    per_minibatch_bytes += sizeof(float) * maxS * maxT;

    // betas (a label sequence of length n is expanded to 2n + 1 with blanks)
    per_minibatch_bytes += sizeof(float) * maxS;

    // labels w/ blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    // compute softmax probabilities
    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription

        bool mb_status;

        std::tie(costs[mb], mb_status) = cost_and_grad_kernel(
            grads + mb * alphabet_size_,
            probs + mb * alphabet_size_,
            flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
            T, L, mb,
            bytes_used + mb * per_minibatch_bytes);
    }

    return CTC_STATUS_SUCCESS;
}

template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per-minibatch scratch memory
    size_t per_minibatch_bytes = 0;

    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_;

    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT;

    // betas
    per_minibatch_bytes += sizeof(float) * maxS;

    // labels w/ blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription
        const int S = 2 * L + 1;         // Number of labels with blanks

        CpuCTC_metadata ctcm(
            L, S, T, mb, alphabet_size_, workspace_,
            bytes_used + mb * per_minibatch_bytes, blank_label_,
            flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));

        if (L + ctcm.repeats > T)
            costs[mb] = ProbT(0);
        else {
            costs[mb] = -compute_alphas(probs + mb * alphabet_size_, ctcm.repeats,
                                        S, T, ctcm.e_inc, ctcm.s_inc,
                                        ctcm.labels_w_blanks, ctcm.alphas);
        }
    }

    return CTC_STATUS_SUCCESS;
}
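
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original API). It assumes
// activations and grads are laid out time-major as (T, minibatch, alphabet),
// which is the layout softmax() indexes with (mb + minibatch * t) * alphabet,
// and that the caller sizes the workspace to cover the softmax buffer plus the
// per-minibatch scratch accounted for in cost_and_grad(). The sizing helper
// below is hypothetical; it just mirrors that accounting for ProbT = float.
//
//   size_t cpu_ctc_workspace_bytes(int maxT, int maxL, int alphabet, int mb) {
//       int maxS = 2 * maxL + 1;
//       size_t per_mb = sizeof(float) * (alphabet + maxS * maxT + maxS)
//                     + 3 * sizeof(int) * maxS;                  // scratch per utterance
//       return sizeof(float) * mb * alphabet * maxT + mb * per_mb; // softmax probs + scratch
//   }
//
//   std::vector<char> workspace(cpu_ctc_workspace_bytes(maxT, maxL, alphabet, mb));
//   CpuCTC<float> ctc(alphabet, mb, workspace.data(),
//                     /*num_threads=*/0, /*blank_label=*/0);
//   std::vector<float> grads(maxT * mb * alphabet, 0.f); // assumed pre-zeroed
//   std::vector<float> costs(mb);
//   ctcStatus_t st = ctc.cost_and_grad(activations, grads.data(), costs.data(),
//                                      flat_labels, label_lengths, input_lengths);
// ---------------------------------------------------------------------------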