"docs/source/en/api/logging.md" did not exist on "c3d78cd3067612175ac9f0f8b234abf5a2e1f510"
Commit 99e2985d authored by lishen's avatar lishen
Browse files

warpctc for dcu

parent 0bf5eb5f
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-(x)))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// sIsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. For values that are not a power of 2, the
// result is rounded up when roundUp is true and rounded down otherwise.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding division by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either branch contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the exponent of 2 in the prime factorization of X (the number of
// trailing zero bits).
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
} // namespace mgpu
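
The compile-time helpers above mirror the MGPU_* macros for use in template arguments. Below is a minimal sketch, not part of this commit, that spot-checks a few of them with static_assert; it assumes C++11 and that the header above is on the include path (its file name is an assumption, since it is not shown in this diff).

#include "moderngpu_static.h"  // hypothetical path for the static-math header above

// All checks are evaluated at compile time; the program has no runtime work.
static_assert(mgpu::sIsPow2<64>::value && !mgpu::sIsPow2<48>::value,
    "power-of-2 detection");
static_assert(mgpu::sDivUp<7, 3>::value == 3, "ceil(7 / 3) == 3");
static_assert(mgpu::sRoundUp<10, 4>::value == 12, "10 rounded up to a multiple of 4");
static_assert(mgpu::sLogPow2<8>::value == 3, "log2(8) is exact");
static_assert(mgpu::sLogPow2<9, true>::value == 4, "non-powers of 2 round up");
static_assert(mgpu::sNumFactorsOf2<24>::value == 3, "24 == 2^3 * 3");
static_assert(mgpu::sBankConflictDivisor<12>::value == 4, "divisor 4 for X = 12 = 2^2 * 3");

int main() { return 0; }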
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward declaration of the stream type so CUDA/HIP headers need not be pulled in.
// The original CUDA declaration is kept for reference; the DCU build aliases the
// HIP stream type to the CUstream name used throughout this interface.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the structure using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
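
A minimal initialization sketch for ctcOptions, not part of the header, following the zero-initialization advice in the comment above; the field values shown are illustrative.

// C++: value-initialization zeroes every member, including any added in a
// future version, which is what keeps the struct forward compatible.
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 0;   // left at 0 for the implementation's default; adjust as needed
options.blank_label = 0;   // index reserved for the CTC blank symbol

// C: the equivalent pattern with memset.
// struct ctcOptions c_options;
// memset(&c_options, 0, sizeof(struct ctcOptions));
// c_options.loc = CTC_CPU;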
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on options. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with minibatch examples
* in the minibatch and alphabet_size symbols in the alphabet, is located at:
* activations[(t * minibatch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as the activations, with identical
* memory ordering.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] minibatch How many examples are in the minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In the same memory space as the activations. Should
* be of the size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size, returns the required workspace
* size. The workspace will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] minibatch How many examples are in the minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes pointer to a scalar that receives the memory
* requirement in bytes. The workspace itself should be allocated in the
* same place, CPU or GPU, that the probabilities are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
#ifdef __cplusplus
}
#endif
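
A minimal CPU-side sketch of the call sequence documented above: zero-initialize the options, size the workspace with get_workspace_size, then call compute_ctc_loss. The shapes, label values, and malloc-based workspace are illustrative assumptions, not part of this commit; it assumes ctc.h (and its torch dependency) is available for inclusion.

#include <cstdlib>
#include <vector>
#include "ctc.h"  // the interface above (documented as \file ctc.h)

int run_cpu_ctc_example() {
    const int T = 50, N = 1, A = 29;                // time steps, minibatch size, alphabet size

    // Element (t, n, p) lives at activations[(t * N + n) * A + p], as documented above.
    std::vector<float> activations(T * N * A, 0.0f);
    std::vector<float> gradients(T * N * A, 0.0f);
    std::vector<int>   flat_labels   = {1, 2, 3};   // all labels in the minibatch, concatenated
    std::vector<int>   label_lengths = {3};         // one entry per example
    std::vector<int>   input_lengths = {T};         // one entry per example
    std::vector<float> costs(N, 0.0f);              // filled with the per-example losses

    ctcOptions options{};                           // zero-init for forward compatibility
    options.loc = CTC_CPU;
    options.blank_label = 0;

    size_t workspace_bytes = 0;
    if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                           A, N, options, &workspace_bytes) != CTC_STATUS_SUCCESS)
        return 1;

    void* workspace = std::malloc(workspace_bytes); // CPU here, same memory space as activations
    ctcStatus_t status = compute_ctc_loss(activations.data(), gradients.data(),
                                          flat_labels.data(), label_lengths.data(),
                                          input_lengths.data(), A, N,
                                          costs.data(), workspace, options);
    std::free(workspace);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}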
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template<typename Arg, typename Res = Arg>
struct maximum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? y : x;
}
};
template<typename Arg, typename Res = Arg>
struct minimum {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x < y ? x : y;
}
};
template<typename Arg, typename Res = Arg>
struct add {
HOSTDEVICE
Res operator()(const Arg &x, const Arg &y) const {
return x + y;
}
};
template<typename Arg, typename Res = Arg>
struct identity {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(x);
}
};
template<typename Arg, typename Res = Arg>
struct negate {
HOSTDEVICE Res operator()(const Arg &x) const {
return Res(-x);
}
};
template<typename Arg, typename Res = Arg>
struct exponential {
HOSTDEVICE Res operator()(const Arg &x) const { return std::exp(x); }
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1 &p1, const Arg2 &p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
// HOSTDEVICE
// Res operator()(const Arg1& p1, const Arg2& p2) {
// Res p12_max = maximum<Res>()(p1, p2);
// Res p12_min = minimum<Res>()(p1, p2);
// Res p12_diff = p12_min-p12_max;
// Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
// Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
// : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//
// return result;
// }
//};
} // namespace ctc_helper
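
log_plus above is a two-argument log-sum-exp: it returns log(exp(p1) + exp(p2)) by adding log1p(exp(-|p1 - p2|)) to the larger argument, so the exponential never overflows, and the neg_inf checks make -infinity behave as log(0). A small host-only sketch, assuming the helper header above (and hostdevice.h) is included, comparing it with the naive formula:

#include <cmath>
#include <cstdio>
// assumes the ctc_helper header above is included

int main() {
    ctc_helper::log_plus<float> lp;

    float a = std::log(0.25f);
    float b = std::log(0.50f);

    float stable = lp(a, b);                                  // log-sum-exp via log1p
    float naive  = std::log(std::exp(a) + std::exp(b));       // log(0.25 + 0.50)
    std::printf("stable = %f, naive = %f\n", stable, naive);  // both ~= log(0.75)

    // neg_inf() is the additive identity in log space: log(0 + x) == log(x).
    std::printf("log_plus(-inf, b) = %f, b = %f\n",
                lp(ctc_helper::neg_inf<float>(), b), b);
    return 0;
}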
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);
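
A minimal C++ sketch of calling the cpu_ctc binding declared above with ATen tensors. The dtypes and shapes follow the usual warp-ctc PyTorch binding convention (float probs of shape (T, N, A), int32 labels and sizes on the CPU, float costs of length N); since the binding's implementation is collapsed in this diff, treat these expectations as assumptions to verify against the source.

#include <torch/extension.h>
// assumes the cpu_ctc declaration above is visible

void cpu_ctc_example() {
    const int64_t T = 50, N = 1, A = 29;        // time steps, minibatch size, alphabet size

    torch::Tensor probs = torch::randn({T, N, A});                   // activations, float32, CPU
    torch::Tensor grads = torch::zeros({T, N, A});                   // written by the call
    torch::Tensor labels = torch::tensor({1, 2, 3}, torch::kInt32);  // concatenated labels
    torch::Tensor label_sizes = torch::full({1}, 3, torch::kInt32);  // per-example label length
    torch::Tensor sizes = torch::full({1}, static_cast<int>(T), torch::kInt32); // per-example time steps
    torch::Tensor costs = torch::zeros({N});                         // per-example loss, written by the call

    int status = cpu_ctc(probs, grads, labels, label_sizes, sizes,
                         /*minibatch_size=*/static_cast<int>(N), costs,
                         /*blank_label=*/0);
    (void)status;
}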