OpenDAS / Warpctc · Commits

Commit 99e2985d, authored May 16, 2023 by lishen
warpctc for dcu
parent 0bf5eb5f
Changes: 41
Showing 20 changed files with 0 additions and 3166 deletions (+0 −3166):
include/contrib/moderngpu/include/mgpuenums.h      +0 −70
include/contrib/moderngpu/include/util/static.h    +0 −183
include/ctc.h                                      +0 −134
include/detail/cpu_ctc.h                           +0 −521
include/detail/ctc_helper.h                        +0 −95
include/detail/gpu_ctc.h                           +0 −479
include/detail/gpu_ctc_kernels.h                   +0 −496
include/detail/hostdevice.h                        +0 −7
include/detail/reduce.h                            +0 −5
pytorch_binding/setup.cfg                          +0 −1
pytorch_binding/setup.py                           +0 −356
pytorch_binding/src/binding.cu                     +0 −112
pytorch_binding/src/binding.hip                    +0 −113
pytorch_binding/src/cpu_binding.h                  +0 −20
pytorch_binding/src/gpu_binding.h                  +0 −21
pytorch_binding/tests/test_gpu.py                  +0 −37
pytorch_binding/tests/test_gpu_speed.py            +0 −75
pytorch_binding/warpctc_pytorch/__init__.py        +0 −89
src/ctc_entrypoint.cu                              +0 −179
src/reduce.cu                                      +0 −173
include/contrib/moderngpu/include/mgpuenums.h (deleted, 100644 → 0)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

namespace mgpu {

enum MgpuBounds {
    MgpuBoundsLower,
    MgpuBoundsUpper
};

enum MgpuScanType {
    MgpuScanTypeExc,
    MgpuScanTypeInc
};

enum MgpuSearchType {
    MgpuSearchTypeNone,
    MgpuSearchTypeIndex,
    MgpuSearchTypeMatch,
    MgpuSearchTypeIndexMatch
};

enum MgpuJoinKind {
    MgpuJoinKindInner,
    MgpuJoinKindLeft,
    MgpuJoinKindRight,
    MgpuJoinKindOuter
};

enum MgpuSetOp {
    MgpuSetOpIntersection,
    MgpuSetOpUnion,
    MgpuSetOpDiff,
    MgpuSetOpSymDiff
};

} // namespace mgpu
include/contrib/moderngpu/include/util/static.h (deleted, 100644 → 0)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {

typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;

// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
    enum { value = 0 == (X & (X - 1)) };
};

// Finds the base-2 logarithm of X. value is -1 if X is not a power of 2.
template<int X, bool roundUp = true> struct sLogPow2 {
    enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
    enum { inner = sLogPow2<X / 2>::inner + 1 };
    enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
    enum { inner = 0 };
    enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
    enum { inner = 0 };
    enum { value = 0 };
};

template<int X, int Y> struct sDivUp {
    enum { value = (X + Y - 1) / Y };
};

template<int count, int levels> struct sDiv2RoundUp {
    enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
    enum { value = count };
};

template<int X, int Y> struct sDivSafe {
    enum { value = X / Y };
};
template<int X> struct sDivSafe<X, 0> {
    enum { value = 0 };
};

template<int X, int Y> struct sRoundUp {
    enum { rem = X % Y };
    enum { value = X + (rem ? (Y - rem) : 0) };
};

template<int X, int Y> struct sRoundDown {
    enum { rem = X % Y };
    enum { value = X - rem };
};

// IntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y> struct sIntegerDiv {
    enum { value = X / (Y ? Y : (X + 1)) };
};

template<int X, int Y> struct sMax {
    enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y> struct sMin {
    enum { value = (X <= Y) ? X : Y };
};
template<int X> struct sAbs {
    enum { value = (X >= 0) ? X : -X };
};

// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
    enum { shifted = X >> 1 };
    enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
    enum { value = 0 };
};

// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
    enum { value = (1 & X) ? 0 :
        (sIsPow2<X>::value ? NumBanks : (1 << sNumFactorsOf2<X>::value)) };
    enum { log_value = sLogPow2<value>::value };
};

template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
    enum { count = NT * X };
    enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
    enum { padding = sDivSafe<count, divisor>::value };
    enum { value = count + padding };
};

} // namespace mgpu
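These helpers are pure compile-time arithmetic, so their behavior can be spot-checked with static_assert. A few checks (my addition, not part of the deleted file), assuming a C++11 compiler and that the header above is included:

// Spot checks of the compile-time helpers above.
static_assert(mgpu::sIsPow2<64>::value, "64 is a power of 2");
static_assert(mgpu::sLogPow2<64>::value == 6, "log2(64) == 6");
static_assert(mgpu::sLogPow2<65, true>::value == 7, "non-powers of 2 round up by default");
static_assert(mgpu::sDivUp<10, 4>::value == 3, "ceil(10 / 4) == 3");
static_assert(mgpu::sDivSafe<10, 0>::value == 0, "division by zero is guarded, not an error");
static_assert(mgpu::sNumFactorsOf2<24>::value == 3, "24 == 2^3 * 3");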
include/ctc.h (deleted, 100644 → 0)
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
//forward declare of CUDA typedef to avoid needing to pull in CUDA headers
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;

typedef enum {
    CTC_STATUS_SUCCESS = 0,
    CTC_STATUS_MEMOPS_FAILED = 1,
    CTC_STATUS_INVALID_VALUE = 2,
    CTC_STATUS_EXECUTION_FAILED = 3,
    CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;

/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();

/** Returns a string containing a description of status that was passed in
 *  \param[in] status identifies which string should be returned
 *  \return C style string containing the text description
 *  */
const char* ctcGetStatusString(ctcStatus_t status);

typedef enum {
    CTC_CPU = 0,
    CTC_GPU = 1
} ctcComputeLocation;

/** Structure used for options to the CTC computation. Applications
 *  should zero out the array using memset and sizeof(struct
 *  ctcOptions) in C or default initialization (e.g. 'ctcOptions
 *  options{};' or 'auto options = ctcOptions{}') in C++ to ensure
 *  forward compatibility with added options. */
struct ctcOptions {
    /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
    ctcComputeLocation loc;
    union {
        /// used when loc == CTC_CPU, the maximum number of threads that can be used
        unsigned int num_threads;
        /// used when loc == CTC_GPU, which stream the kernels should be launched in
        CUstream stream;
    };
    /// the label value/index that the CTC calculation should use as the blank label
    int blank_label;
};

/** Compute the connectionist temporal classification loss between a sequence
 *  of probabilities and a ground truth labeling. Optionally compute the
 *  gradient with respect to the inputs.
 * \param [in] activations pointer to the activations in either CPU or GPU
 *             addressable memory, depending on info. We assume a fixed
 *             memory layout for this 3 dimensional tensor, which has dimension
 *             (t, n, p), where t is the time index, n is the minibatch index,
 *             and p indexes over probabilities of each symbol in the alphabet.
 *             The memory layout is (t, n, p) in C order (slowest to fastest changing
 *             index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
 *             changing index, aka column-major). We also assume strides are equal to
 *             dimensions - there is no padding between dimensions.
 *             More precisely, element (t, n, p), for a problem with mini_batch examples
 *             in the mini batch, and alphabet_size symbols in the alphabet, is located at:
 *             activations[(t * mini_batch + n) * alphabet_size + p]
 * \param [out] gradients if not NULL, then gradients are computed. Should be
 *              allocated in the same memory space as probs and memory
 *              ordering is identical.
 * \param [in] flat_labels Always in CPU memory. A concatenation
 *             of all the labels for the minibatch.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size The number of possible output symbols. There
 *             should be this many probabilities for each time step.
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [out] costs Always in CPU memory. The cost of each example in the
 *              minibatch.
 * \param [in,out] workspace In same memory space as probs. Should be of
 *                 size requested by get_workspace_size.
 * \param [in] options see struct ctcOptions
 *
 * \return Status information
 *
 * */
ctcStatus_t compute_ctc_loss(const float* const activations,
                             float* gradients,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options);

/** For a given set of labels and minibatch size return the required workspace
 *  size. This will need to be allocated in the same memory space as your
 *  probabilities.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
 *             the number of probabilities at each time step
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [in] info see struct ctcOptions
 * \param [out] size_bytes is pointer to a scalar where the memory
 *              requirement in bytes will be placed. This memory should be allocated
 *              at the same place, CPU or GPU, that the probs are in
 *
 * \return Status information
 **/
ctcStatus_t get_workspace_size(const int* const label_lengths,
                               const int* const input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions info,
                               size_t* size_bytes);

#ifdef __cplusplus
}
#endif
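The intended calling sequence is: query get_workspace_size, allocate the workspace in the matching memory space, then call compute_ctc_loss. A minimal CPU-path sketch (my addition; the sizes and label values are hypothetical), following the (t, n, p) layout documented above:

#include <cstring>
#include <cstdlib>
#include <vector>
#include "ctc.h"

int main() {
    const int alphabet_size = 5, minibatch = 1;      // hypothetical sizes
    const int T = 10, L = 3;                         // 10 frames, 3-label transcript
    std::vector<float> activations(T * minibatch * alphabet_size, 0.f);
    std::vector<float> gradients(activations.size(), 0.f);
    std::vector<int> flat_labels = {1, 2, 3};        // all labels, concatenated
    std::vector<int> label_lengths = {L};
    std::vector<int> input_lengths = {T};
    std::vector<float> costs(minibatch);

    ctcOptions options;
    std::memset(&options, 0, sizeof(options));       // forward-compatible zeroing, per the doc comment
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t workspace_bytes = 0;
    if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                           alphabet_size, minibatch, options,
                           &workspace_bytes) != CTC_STATUS_SUCCESS)
        return 1;

    void* workspace = std::malloc(workspace_bytes);  // CPU memory because loc == CTC_CPU
    ctcStatus_t status = compute_ctc_loss(activations.data(), gradients.data(),
                                          flat_labels.data(), label_lengths.data(),
                                          input_lengths.data(), alphabet_size,
                                          minibatch, costs.data(), workspace, options);
    std::free(workspace);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}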
include/detail/cpu_ctc.h (deleted, 100644 → 0)
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template<typename ProbT>
class CpuCTC {
public:
    // Noncopyable
    CpuCTC(int alphabet_size, int minibatch, void* workspace,
           int num_threads, int blank_label) :
        alphabet_size_(alphabet_size),
        minibatch_(minibatch),
        num_threads_(num_threads),
        workspace_(workspace),
        blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
        if (num_threads > 0) {
            omp_set_num_threads(num_threads);
        } else {
            num_threads_ = omp_get_max_threads();
        }
#endif
    };

    CpuCTC(const CpuCTC&) = delete;
    CpuCTC& operator=(const CpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:
    class CpuCTC_metadata {
    private:
        int setup_labels(const int* const labels, int blank_label, int L, int S);

    public:
        CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
                        void* workspace, size_t bytes_used, int blank_label,
                        const int* const labels);

        ProbT* alphas;
        ProbT* betas;
        int* labels_w_blanks;
        int* e_inc;
        int* s_inc;
        ProbT* output;
        int repeats;
    };

    int alphabet_size_; // Number of characters plus blank
    int minibatch_;
    int num_threads_;
    int blank_label_;
    void* workspace_;

    void softmax(const ProbT* const activations, ProbT* probs,
                 const int* const input_lengths);

    std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
                                                 const ProbT* const probs,
                                                 const int* const labels,
                                                 int T, int L, int mb,
                                                 size_t bytes_used);

    ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T,
                         const int* const e_inc,
                         const int* const s_inc,
                         const int* const labels,
                         ProbT* alphas);

    ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                 ProbT log_partition, int repeats,
                                 int S, int T,
                                 const int* const e_inc,
                                 const int* const s_inc,
                                 const int* const labels,
                                 ProbT* alphas,
                                 ProbT* betas,
                                 ProbT* output);
};

template<typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
                                                int alphabet_size,
                                                void* workspace,
                                                size_t bytes_used,
                                                int blank_label,
                                                const int* const labels) {
    alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S * T;
    std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
    betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S;
    std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
    labels_w_blanks = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;
    output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * alphabet_size;

    repeats = setup_labels(labels, blank_label, L, S);
}

template<typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
                                                 int blank_label, int L, int S) {
    int e_counter = 0;
    int s_counter = 0;

    s_inc[s_counter++] = 1; // get start

    int repeats = 0; // number of repeats

    for (int i = 1; i < L; ++i) {
        if (labels[i - 1] == labels[i]) { // repeat label
            s_inc[s_counter++] = 1;
            s_inc[s_counter++] = 1; // label and blank
            e_inc[e_counter++] = 1;
            e_inc[e_counter++] = 1;
            ++repeats;
        } else { // single label and no repeat
            s_inc[s_counter++] = 2;
            e_inc[e_counter++] = 2;
        }
    }
    e_inc[e_counter++] = 1; // get end

    // // printf("s_counter=%d, e_counter=%d, repeats=%d\n", s_counter, e_counter, repeats);
    // for (int i = 0; i < S; ++i) {
    //     printf("s_inc[%d]=%d, e_inc[%d]=%d\n", i, s_inc[i], i, e_inc[i]);
    // }

    for (int i = 0; i < L; ++i) {
        labels_w_blanks[2 * i] = blank_label;
        labels_w_blanks[2 * i + 1] = labels[i];
    }
    labels_w_blanks[S - 1] = blank_label; // end is blank

    return repeats;
}

template<typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
                            const int* const input_lengths) {
#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) { // iter batch
        for (int c = 0; c < input_lengths[mb]; ++c) { // iter input audio vec
            int col_offset = (mb + minibatch_ * c) * alphabet_size_; // vec index * alphabet_size_

            //// get max_activation
            ProbT max_activation = -std::numeric_limits<ProbT>::infinity(); // start from -infinity
            for (int r = 0; r < alphabet_size_; ++r) // iter alphabet
                max_activation = std::max(max_activation, activations[r + col_offset]);

            //// compute probs between activations and max
            ProbT denom = ProbT(0.);
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
                denom += probs[r + col_offset];
            }

            //// scale probs
            for (int r = 0; r < alphabet_size_; ++r) {
                probs[r + col_offset] /= denom;
            }
        }
    }
}

template<typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const probs,
                                    const int* const labels,
                                    int T, int L, int mb, size_t bytes_used) {
    const int S = 2 * L + 1; // Number of labels with blanks

    CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used,
                         blank_label_, labels);

    bool over_threshold = false;

    // check (length of labels + repeats) <= (length of utterance)
    if (L + ctcm.repeats > T) {
        return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
    }

    ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T,
                                     ctcm.e_inc, ctcm.s_inc,
                                     ctcm.labels_w_blanks, ctcm.alphas);

    ProbT llBackward = compute_betas_and_grad(grad, probs, llForward,
                                              ctcm.repeats, S, T,
                                              ctcm.e_inc, ctcm.s_inc,
                                              ctcm.labels_w_blanks,
                                              ctcm.alphas, ctcm.betas, ctcm.output);

    ProbT diff = std::abs(llForward - llBackward);
    if (diff > ctc_helper::threshold) {
        over_threshold = true;
    }

    return std::make_tuple(-llForward, over_threshold);
}

// Computes forward probabilities
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
                                    const int* const e_inc,
                                    const int* const s_inc,
                                    const int* const labels,
                                    ProbT* alphas) {
    int start = (((S / 2) + repeats - T) < 0) ? 0 : 1,
        end = S > 1 ? 2 : 1;

    // get log probs of label
    for (int i = start; i < end; ++i) {
        alphas[i] = std::log(probs[labels[i]]);
    }
    // printf("start=%d, end=%d, t=1~srcLen=%d, repeats=%d\n", start, end, T, repeats);

    for (int t = 1; t < T; ++t) {
        int remain = (S / 2) + repeats - (T - t);
        // printf("t=%d, remain=%d\n", t, remain);
        if (remain >= 0)
            start += s_inc[remain];
        if (t <= (S / 2) + repeats)
            end += e_inc[t - 1];
        int startloop = start;
        int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

        if (start == 0) {
            alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
            // printf("00 alphas[%d]=%f, alphas[%d]=%f\n", t, alphas[idx1], t - 1, alphas[idx2]);
            startloop += 1;
        }
        // printf("start=%d, startloop=%d, end=%d\n", start, startloop, end);

        for (int i = startloop; i < end; ++i) {
            ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2],
                                                           alphas[(i - 1) + idx2]);
            // printf("11 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);

            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) {
                prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
                // printf("22 t=%d, u=%d, prev_sum=%f\n", t, i, prev_sum);
            }

            alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
            // printf("33 alpha[%d,%d]=%f, log(p(%d))=%f, label(%d)=%d\n", t, i, alphas[i + idx1], labels[i], std::log(probs[labels[i] + idx3]), i, labels[i]);
        }
    }
    // printf("final start=%d, end=%d\n", start, end);

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
    }
    // printf("compute alpha cost=%f\n", -loglike);

#ifdef DEBUG_KERNEL
    printf("cpu alphas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", alphas[t + (T - 1) * S]);
    }
    printf("\n");
    printf("alphas loglike=%f\n", loglike);
#endif

    return loglike;
}

// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update product alpha * beta and then
// sum into the gradient associated with each label.
// NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
// Assumed passed in grads are already zeroed!
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
                                            ProbT log_partition, int repeats,
                                            int S, int T,
                                            const int* const e_inc,
                                            const int* const s_inc,
                                            const int* const labels,
                                            ProbT* alphas,
                                            ProbT* betas,
                                            ProbT* output) {
    int start = S > 1 ? (S - 2) : 0,
        end = (T > (S / 2) + repeats) ? S : S - 1;

    std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

    // set the starting values in the beta column at the very right edge
    for (int i = start; i < end; ++i) {
        betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);

        // compute alpha * beta in log space at this position in (S, T) space
        alphas[i + (T - 1) * S] += betas[i];

        // update the gradient associated with this label
        // essentially performing a reduce-by-key in a sequential manner
        output[labels[i]] =
            ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
    }

    // update the gradient wrt to each unique label
    for (int i = 0; i < alphabet_size_; ++i) {
        int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

        if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
            probs[idx3] == 0.0) {
            grad[idx3] = probs[idx3];
        } else {
            grad[idx3] = probs[idx3] -
                std::exp(output[i] - std::log(probs[idx3]) - log_partition);
        }
    }

    // loop from the second to last column all the way to the left
    for (int t = T - 2; t >= 0; --t) {
        int remain = (S / 2) + repeats - (T - t);
        if (remain >= -1)
            start -= s_inc[remain + 1];
        if (t < (S / 2) + repeats)
            end -= e_inc[t];

        int endloop = end == S ? end - 1 : end;
        int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

        std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

        for (int i = start; i < endloop; ++i) {
            ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != (S - 2) && labels[i] != labels[i + 2]) {
                next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
            }
            betas[i] = next_sum + std::log(probs[labels[i] + idx3]);

            // compute alpha * beta in log space
            alphas[i + idx1] += betas[i];

            // update the gradient associated with this label
            output[labels[i]] =
                ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
        }

        if (end == S) {
            betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
            alphas[(S - 1) + idx1] += betas[(S - 1)];

            output[labels[S - 1]] =
                ctc_helper::log_plus<ProbT>()(alphas[S - 1 + idx1], output[labels[S - 1]]);
        }

        // go over the unique labels and compute the final grad
        // wrt to each one at this time step
        for (int i = 0; i < alphabet_size_; ++i) {
            if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
                probs[idx3] == 0.0) {
                grad[idx3] = probs[idx3];
            } else {
                grad[idx3] = probs[idx3] -
                    std::exp(output[i] - std::log(probs[idx3]) - log_partition);
            }
            ++idx3;
        }
    }

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
    }

#ifdef DEBUG_KERNEL
    printf("cpu betas:\n");
    printf("T=%d, (T-1)*S=%d, start=%d, end=%d\n", T, (T - 1) * S, start, end);
    for (int t = start; t < end; ++t) {
        printf("%.5f ", betas[t]);
    }
    printf("\n");
    printf("betas loglike=%f\n", loglike);
#endif

    return loglike;
}

template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                                         ProbT* grads,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    // get max length input audio vector
    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    // memory to use
    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per minibatch memory
    size_t per_minibatch_bytes = 0;

    // get max length input text vector
    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1; // labels with blanks

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_; // vector of alphabet

    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT; // matrix size

    // betas
    per_minibatch_bytes += sizeof(float) * maxS;
    // sequence label size is n, alloc 2n+1, with blanks

    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    // compute softmax probs
    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription

        bool mb_status;

        std::tie(costs[mb], mb_status) =
            cost_and_grad_kernel(grads + mb * alphabet_size_,
                                 probs + mb * alphabet_size_,
                                 flat_labels + std::accumulate(label_lengths,
                                                               label_lengths + mb, 0),
                                 T, L, mb,
                                 bytes_used + mb * per_minibatch_bytes);
    }

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* probs = static_cast<ProbT*>(workspace_);

    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // per minibatch memory
    size_t per_minibatch_bytes = 0;

    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_;
    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT;
    // betas
    per_minibatch_bytes += sizeof(float) * maxS;
    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    softmax(activations, probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription
        const int S = 2 * L + 1;         // Number of labels with blanks

        CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
                             bytes_used + mb * per_minibatch_bytes,
                             blank_label_,
                             flat_labels + std::accumulate(label_lengths,
                                                           label_lengths + mb, 0));

        if (L + ctcm.repeats > T)
            costs[mb] = ProbT(0);
        else {
            costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
                                        ctcm.repeats, S, T,
                                        ctcm.e_inc, ctcm.s_inc,
                                        ctcm.labels_w_blanks, ctcm.alphas);
        }
    }

    return CTC_STATUS_SUCCESS;
}
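One detail that is easy to misread above: softmax addresses frame c of minibatch entry mb at col_offset = (mb + minibatch_ * c) * alphabet_size_, which is the same address as the documented activations[(t * mini_batch + n) * alphabet_size + p] with t = c, n = mb, p = 0. A tiny standalone sketch (my addition, hypothetical sizes) that prints the resulting element ranges so the loop can be followed on paper:

#include <cstdio>

int main() {
    const int minibatch = 2, alphabet = 3, T = 4; // hypothetical sizes
    // activations[(t * minibatch + n) * alphabet + p], per the ctc.h doc comment
    for (int mb = 0; mb < minibatch; ++mb)
        for (int c = 0; c < T; ++c) {
            int col_offset = (mb + minibatch * c) * alphabet; // as in CpuCTC::softmax
            std::printf("mb=%d t=%d -> elements [%d, %d)\n",
                        mb, c, col_offset, col_offset + alphabet);
        }
    return 0;
}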
include/detail/ctc_helper.h (deleted, 100644 → 0)
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {

static const float threshold = 1e-1;

template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }

inline int div_up(int x, int y) {
    return (x + y - 1) / y;
}

template<typename Arg, typename Res = Arg> struct maximum {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x < y ? y : x;
    }
};

template<typename Arg, typename Res = Arg> struct minimum {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x < y ? x : y;
    }
};

template<typename Arg, typename Res = Arg> struct add {
    HOSTDEVICE
    Res operator()(const Arg& x, const Arg& y) const {
        return x + y;
    }
};

template<typename Arg, typename Res = Arg> struct identity {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return Res(x); }
};

template<typename Arg, typename Res = Arg> struct negate {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return Res(-x); }
};

template<typename Arg, typename Res = Arg> struct exponential {
    HOSTDEVICE
    Res operator()(const Arg& x) const { return std::exp(x); }
};

template<typename Arg1, typename Arg2 = Arg1, typename Res = Arg1>
struct log_plus {
    typedef Res result_type;
    HOSTDEVICE
    Res operator()(const Arg1& p1, const Arg2& p2) {
        if (p1 == neg_inf<Arg1>())
            return p2;
        if (p2 == neg_inf<Arg2>())
            return p1;
        Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
        return result;
    }
};

//template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
//struct log_plus {
//    HOSTDEVICE
//    Res operator()(const Arg1& p1, const Arg2& p2) {
//        Res p12_max = maximum<Res>()(p1, p2);
//        Res p12_min = minimum<Res>()(p1, p2);
//        Res p12_diff = p12_min - p12_max;
//        Res NEGATIVE_CUTOFF_VAL = -(Res)100000;
//
//        Res result = p12_diff <= NEGATIVE_CUTOFF_VAL ? maximum<Res>()(p12_max, NEGATIVE_CUTOFF_VAL)
//                     : maximum<Res>()(p12_max + log(exp(p12_diff) + 1), NEGATIVE_CUTOFF_VAL);
//
//        return result;
//    }
//};

}
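log_plus is a two-term log-sum-exp: it evaluates log(exp(p1) + exp(p2)) as max(p1, p2) + log1p(exp(-|p1 - p2|)), so nothing large is ever exponentiated. A quick host-side check of that identity (my addition, written against plain doubles rather than the templated functor):

#include <algorithm>
#include <cassert>
#include <cmath>

// Same identity as ctc_helper::log_plus, written out for two doubles.
double log_plus_ref(double p1, double p2) {
    if (std::isinf(p1) && p1 < 0) return p2;
    if (std::isinf(p2) && p2 < 0) return p1;
    return std::log1p(std::exp(-std::fabs(p1 - p2))) + std::max(p1, p2);
}

int main() {
    // log(e^-1000 + e^-1001) stays finite even though both exp() calls would underflow.
    double r = log_plus_ref(-1000.0, -1001.0);
    assert(r > -1000.0 && r < -999.0);
    // Agrees with the naive formula wherever the naive formula is representable.
    assert(std::fabs(log_plus_ref(0.0, 1.0) -
                     std::log(std::exp(0.0) + std::exp(1.0))) < 1e-12);
    return 0;
}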
include/detail/gpu_ctc.h (deleted, 100644 → 0)
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
#include <stdio.h>
const int kCUDABlockNumThreads = 256;

template<typename ProbT>
class GpuCTC {
public:
    GpuCTC(int alphabet_size, int minibatch, void* workspace,
           CUstream stream, int blank_label) :
        out_dim_(alphabet_size), minibatch_(minibatch),
        gpu_workspace_(workspace), stream_(stream),
        blank_label_(blank_label) {};

    // Noncopyable
    GpuCTC(const GpuCTC&) = delete;
    GpuCTC& operator=(const GpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:
    template<int NT, int VT>
    ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
                                          ProbT* grads,
                                          bool compute_alpha,
                                          bool compute_beta);

    ctcStatus_t launch_gpu_kernels(const ProbT* const probs,
                                   ProbT* grads,
                                   size_t config,
                                   bool launch_alpha,
                                   bool launch_beta);

    ctcStatus_t setup_gpu_metadata(const int* const flat_labels,
                                   const int* const label_lengths,
                                   const int* const input_lengths);

    ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths,
                                                  const int* const flat_labels,
                                                  const int* const input_lengths,
                                                  size_t& best_config);

    ctcStatus_t compute_probs(const ProbT* const activations);

    ctcStatus_t compute_cost_and_score(const ProbT* const activations,
                                       ProbT* grads,
                                       ProbT* costs,
                                       const int* const flat_labels,
                                       const int* const label_lengths,
                                       const int* const input_lengths,
                                       bool compute_alpha,
                                       bool compute_betas_and_grad);

    int out_dim_;          // Number of characters plus blank
    int minibatch_;
    int S_;
    int T_;
    int activation_cols_;  // Number of columns in activations
    CUstream stream_;
    int blank_label_;
    void* gpu_workspace_;  // Buffer for all temporary GPU memory
    int* utt_length_;      // T
    int* label_sizes_;     // L
    int* repeats_;         // repeats_
    int* label_offsets_;
    int* labels_without_blanks_;
    int* labels_with_blanks_;
    ProbT* alphas_;
    ProbT* nll_forward_;
    ProbT* nll_backward_;
    ProbT* denoms_;        // Temporary storage for denoms for softmax
    ProbT* probs_;         // Temporary storage for probabilities (softmax output)
};

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
                                              const int* const label_lengths,
                                              const int* const input_lengths) {
    size_t gpu_bytes_used = 0;

    nll_forward_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(ProbT);

    nll_backward_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(ProbT);

    repeats_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    label_offsets_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    // This is the max of all S and T for all valid examples in the minibatch.
    // A valid example is one for which L + repeats <= T
    S_ = 0;
    T_ = 0;

    // This is the max of all timesteps, valid or not. Needed to compute offsets
    int Tmax = 0;

    // This is the max of all labels, valid or not. Needed to compute offsets
    int Lmax = 0;
    int total_label_length = 0;

    constexpr int cpu_buffer_size = 64;
    int repeats[cpu_buffer_size];
    int label_offsets[cpu_buffer_size];

    const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);

    hipError_t cuda_status;

    for (int pass = 0; pass < num_passes; ++pass) {
        const int start_idx = pass * cpu_buffer_size;
        const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);

        for (int j = start_idx; j < end_idx; ++j) {
            const int L = label_lengths[j];
            const int local_T = input_lengths[j];
            const int* label_ptr = &(flat_labels[total_label_length]);

            label_offsets[j % cpu_buffer_size] = total_label_length;
            total_label_length += L;

            int repeat_counter = 0;

            for (int i = 1; i < L; ++i)
                repeat_counter += (label_ptr[i] == label_ptr[i - 1]);

            repeats[j % cpu_buffer_size] = repeat_counter;
            const bool valid_label = ((L + repeat_counter) <= local_T);

            // Only update S and T if label is valid
            S_ = (valid_label) ? std::max(S_, L) : S_;
            T_ = (valid_label) ? std::max(T_, local_T) : T_;

            Tmax = std::max(Tmax, local_T);
            Lmax = std::max(Lmax, L);
        }

        cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats,
                                     (end_idx - start_idx) * sizeof(int),
                                     hipMemcpyHostToDevice, stream_);
        if (cuda_status != hipSuccess)
            return CTC_STATUS_MEMOPS_FAILED;

        cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
                                     (end_idx - start_idx) * sizeof(int),
                                     hipMemcpyHostToDevice, stream_);
        if (cuda_status != hipSuccess)
            return CTC_STATUS_MEMOPS_FAILED;
    }

    S_ = 2 * S_ + 1;

    const int Smax = 2 * Lmax + 1;
    activation_cols_ = minibatch_ * Tmax;

    // Allocate memory for T
    utt_length_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(utt_length_, input_lengths,
                                 minibatch_ * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    label_sizes_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(label_sizes_, label_lengths,
                                 minibatch_ * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    labels_without_blanks_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += Lmax * minibatch_ * sizeof(int);

    cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels,
                                 total_label_length * sizeof(int),
                                 hipMemcpyHostToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    labels_with_blanks_ =
        reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += Smax * minibatch_ * sizeof(int);

    alphas_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);

    denoms_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += activation_cols_ * sizeof(ProbT);

    probs_ =
        reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
    gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
                                                     ProbT* grads,
                                                     bool compute_alpha,
                                                     bool compute_beta) {
    // One thread block per utterance
    const int grid_size = minibatch_;

    // The data is laid out so that the next timestep is minibatch entries away
    const int stride = minibatch_;

    if (compute_alpha) {
        compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (probs, label_sizes_, utt_length_, repeats_,
             labels_without_blanks_, label_offsets_, labels_with_blanks_,
             alphas_, nll_forward_, stride, out_dim_, S_, T_, blank_label_);
        hipStreamSynchronize(stream_);
    }

    if (compute_beta) {
        compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (probs, label_sizes_, utt_length_, repeats_, labels_with_blanks_,
             alphas_, nll_forward_, nll_backward_, grads,
             stride, out_dim_, S_, T_, blank_label_);
        hipStreamSynchronize(stream_);
    }

    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
                                                 const int* const label_lengths,
                                                 const int* const input_lengths,
                                                 size_t& best_config) {
    // Setup the metadata for GPU
    ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    constexpr int num_configs = 12;

    int config_NT[num_configs] = {32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
    int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};

    best_config = 0;
    for (int i = 0; i < num_configs; ++i) {
        if ((config_NT[i] * config_VT[i]) >= S_)
            break;
        else
            best_config++;
    }

    if (best_config >= num_configs)
        return CTC_STATUS_UNKNOWN_ERROR;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const probs,
                                              ProbT* grads,
                                              size_t config,
                                              bool l_a,
                                              bool l_b) {
    switch (config) {
        case 0:  { return launch_alpha_beta_kernels<32,  1>(probs, grads, l_a, l_b); }
        case 1:  { return launch_alpha_beta_kernels<64,  1>(probs, grads, l_a, l_b); }
        case 2:  { return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b); }
        case 3:  { return launch_alpha_beta_kernels<64,  3>(probs, grads, l_a, l_b); }
        case 4:  { return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b); }
        case 5:  { return launch_alpha_beta_kernels<32,  9>(probs, grads, l_a, l_b); }
        case 6:  { return launch_alpha_beta_kernels<64,  6>(probs, grads, l_a, l_b); }
        case 7:  { return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b); }
        case 8:  { return launch_alpha_beta_kernels<64,  9>(probs, grads, l_a, l_b); }
        case 9:  { return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b); }
        case 10: { return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b); }
        case 11: { return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b); }
    }

    return CTC_STATUS_EXECUTION_FAILED;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
    hipError_t cuda_status;
    cuda_status = hipMemcpyAsync(probs_, activations,
                                 activation_cols_ * out_dim_ * sizeof(ProbT),
                                 hipMemcpyDeviceToDevice, stream_);
    if (cuda_status != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;
    cuda_status = hipStreamSynchronize(stream_);

    // Numerically stable SM
    ctcStatus_t ctc_status = reduce_max(probs_, denoms_, out_dim_,
                                        activation_cols_, 1, stream_);
    if (ctc_status != CTC_STATUS_SUCCESS)
        return ctc_status;

    // Kernel launch to subtract maximum
    const int NT = kCUDABlockNumThreads;
    const int VT = 1;
    const int NV = NT * VT;
    const int num_elements = out_dim_ * activation_cols_;
    const int grid_size = ctc_helper::div_up(num_elements, NV);

    prepare_stable_SM_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
        (ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);

    // Reduce along columns to calculate denominator
    ctc_status = reduce_exp(probs_, denoms_, out_dim_,
                            activation_cols_, 1, stream_);
    if (ctc_status != CTC_STATUS_SUCCESS)
        return ctc_status;

    // Kernel launch to calculate probabilities
    compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
        (ctc_helper::exponential<ProbT>(), probs_, denoms_, out_dim_, num_elements);

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
                                                  ProbT* grads,
                                                  ProbT* costs,
                                                  const int* const flat_labels,
                                                  const int* const label_lengths,
                                                  const int* const input_lengths,
                                                  bool compute_alpha,
                                                  bool compute_betas_and_grad) {
    size_t best_config;
    ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
                                                           label_lengths,
                                                           input_lengths,
                                                           best_config);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    status = compute_probs(activations);
    if (status != CTC_STATUS_SUCCESS)
        return status;

    launch_gpu_kernels(probs_, grads, best_config,
                       compute_alpha, compute_betas_and_grad);

    hipError_t cuda_status_mem, cuda_status_sync;
    cuda_status_mem = hipMemcpyAsync(costs, nll_forward_,
                                     sizeof(ProbT) * minibatch_,
                                     hipMemcpyDeviceToHost, stream_);
    cuda_status_sync = hipStreamSynchronize(stream_);
    if (cuda_status_mem != hipSuccess || cuda_status_sync != hipSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    return CTC_STATUS_SUCCESS;
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                                         ProbT* grads,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    return compute_cost_and_score(activations, grads, costs, flat_labels,
                                  label_lengths, input_lengths, true, true);
}

template<typename ProbT>
ctcStatus_t GpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    return compute_cost_and_score(activations, nullptr, costs, flat_labels,
                                  label_lengths, input_lengths, true, false);
}
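Both the CPU and GPU paths carve every temporary buffer out of the single caller-provided workspace with the same bump-pointer idiom: reinterpret_cast of a char* base plus a running byte offset. A sketch of that idiom in isolation (my addition; names are illustrative, and the real code relies on the natural alignment of the carved types):

#include <cstddef>

// Hypothetical illustration of the carve-from-workspace idiom used above.
template <typename T>
T* carve(void* workspace, size_t& bytes_used, size_t count) {
    T* p = reinterpret_cast<T*>(static_cast<char*>(workspace) + bytes_used);
    bytes_used += count * sizeof(T);
    return p;
}

void layout_example(void* workspace, int minibatch) {
    size_t used = 0;
    float* nll_forward  = carve<float>(workspace, used, minibatch);
    float* nll_backward = carve<float>(workspace, used, minibatch);
    int*   repeats      = carve<int>(workspace, used, minibatch);
    (void)nll_forward; (void)nll_backward; (void)repeats;
    // 'used' must never exceed the size reported by get_workspace_size().
}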
include/detail/gpu_ctc_kernels.h (deleted, 100644 → 0)
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
#include <stdio.h>
using
namespace
mgpu
;
template
<
int
NT
,
int
VT
,
typename
T
,
typename
KeyT
,
typename
Op
>
struct
CTASegReduce
{
enum
{
NV
=
NT
*
VT
};
union
Storage
{
typename
CTAScan
<
NT
>::
Storage
scanStorage
;
int
indices
[
NV
];
};
//adapted from global kernel KernelReduceByKeyPreprocess
__device__
static
void
preprocessKeys
(
KeyT
*
keys
,
int
count
,
int
*
numUniqueLabels
,
int
seg_start
[
VT
],
int
seg_end
[
VT
],
int
*
scanout
)
{
__shared__
Storage
shared
;
const
int
tid
=
threadIdx
.
x
;
// Compare adjacent keys within each thread and mark discontinuities
int
endFlags
=
0
;
T
key
=
keys
[
VT
*
tid
];
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
int
index
=
VT
*
tid
+
1
+
i
;
T
next
=
keys
[
index
];
if
(
index
==
count
||
(
index
<
count
&&
key
!=
next
))
{
endFlags
|=
1
<<
i
;
}
key
=
next
;
}
__syncthreads
();
//Count the number of encountered end flags
int
scan
=
CTAScan
<
NT
>::
Scan
(
tid
,
popc
(
endFlags
),
shared
.
scanStorage
,
numUniqueLabels
);
__syncthreads
();
//output the unique keys
//use indices as scratch space
int
outputPos
=
scan
;
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
if
((
endFlags
>>
i
)
&
1
)
{
shared
.
indices
[
outputPos
]
=
keys
[
VT
*
tid
+
i
];
scanout
[
outputPos
]
=
VT
*
tid
+
i
;
outputPos
++
;
}
}
__syncthreads
();
// Create start and end
for
(
int
idx
=
tid
,
j
=
0
;
idx
<
(
*
numUniqueLabels
);
idx
+=
blockDim
.
x
,
++
j
)
{
seg_start
[
j
]
=
(
idx
==
0
)
?
0
:
(
scanout
[
idx
-
1
]
+
1
);
seg_end
[
j
]
=
scanout
[
idx
];
}
__syncthreads
();
//copy from the scratch space back into the keys
#pragma unroll
for
(
int
i
=
0
;
i
<
VT
;
++
i
)
{
keys
[
i
*
NT
+
tid
]
=
shared
.
indices
[
i
*
NT
+
tid
];
}
__syncthreads
();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, thus a sparse
// set of values are read from the probs matrix since the character set is much smaller
// than the labels. This is much more true for Mandarin than English.
template
<
typename
ProbT
,
int
NT
,
int
VT
>
__global__
void
compute_alpha_kernel
(
const
ProbT
*
probs
,
const
int
*
label_sizes
,
const
int
*
utt_length
,
const
int
*
repeats_in_labels
,
const
int
*
labels_without_blanks
,
const
int
*
label_offsets
,
int
*
labels_with_blanks
,
ProbT
*
alphas
,
ProbT
*
nll_forward
,
int
stride
,
int
out_dim
,
int
S_memoffset
,
int
T_memoffset
,
int
blank_label
)
{
ctc_helper
::
log_plus
<
ProbT
>
log_plus_f
;
const
int
tid
=
threadIdx
.
x
;
const
int
L
=
label_sizes
[
blockIdx
.
x
];
const
int
T
=
utt_length
[
blockIdx
.
x
];
const
int
S
=
2
*
L
+
1
;
const
int
prob_offset
=
out_dim
*
blockIdx
.
x
;
const
int
repeats
=
repeats_in_labels
[
blockIdx
.
x
];
const
int
NV
=
NT
*
VT
;
__shared__
int
label
[
NV
];
if
((
L
+
repeats
)
>
T
)
return
;
// Generate labels with blanks from labels without blanks
{
const
int
label_start_offset
=
label_offsets
[
blockIdx
.
x
];
for
(
int
idx
=
tid
;
idx
<
L
;
idx
+=
blockDim
.
x
)
{
const
int
offset
=
(
blockIdx
.
x
*
S_memoffset
)
+
2
*
idx
;
labels_with_blanks
[
offset
]
=
blank_label
;
labels_with_blanks
[
offset
+
1
]
=
labels_without_blanks
[
label_start_offset
+
idx
];
}
if
(
tid
==
0
)
{
labels_with_blanks
[(
blockIdx
.
x
*
S_memoffset
)
+
2
*
L
]
=
blank_label
;
}
}
__syncthreads
();
const
int
*
labels
=
labels_with_blanks
;
const
int
*
label_global
=
&
labels
[
blockIdx
.
x
*
S_memoffset
];
ProbT
*
alpha
=
&
alphas
[
blockIdx
.
x
*
(
S_memoffset
*
T_memoffset
)];
// Set the first row of alpha neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
for
(
int
idx
=
tid
;
idx
<
min
(
S
,
NV
);
idx
+=
blockDim
.
x
)
{
alpha
[
idx
]
=
ctc_helper
::
neg_inf
<
ProbT
>
();
}
// Load labels into shared memory
#pragma unroll
for
(
int
i
=
tid
;
i
<
S
;
i
+=
NT
)
{
label
[
i
]
=
label_global
[
i
];
}
__syncthreads
();
int
start
=
(
L
+
repeats
<
T
)
?
0
:
1
;
int
end
=
S
>
1
?
2
:
1
;
// Initialize the first row corresponding to t=0;
for
(
int
i
=
tid
;
i
<
(
end
-
start
);
i
+=
blockDim
.
x
)
{
alpha
[
i
+
start
]
=
log
(
probs
[
prob_offset
+
label
[
i
+
start
]]);
//printf("compute_alpha_kernel probs is %f\n", probs[prob_offset + label[i + start]]);
//printf("compute_alpha_kernel alpha is %f\n", alpha[i + start]);
}
__syncthreads
();
// Fill in the rest of matrix, one row at a time (outer loop).
for
(
int
t
=
1
;
t
<
T
;
++
t
)
{
// Start offsets into the current and previous row
const
int
start_cur_row
=
t
*
S
;
const
int
start_prev_row
=
(
t
-
1
)
*
S
;
// The prob is a 2D column major array, with probabilites for each t strided
// by (out_dim * stride), where stride is the minibatch size, out_dim is alphabet_size
const
int
start_prob_col
=
t
*
(
out_dim
*
stride
);
// This is the first column and in this case there is nothing left of it
if
(
tid
==
0
)
{
if
(
start
==
0
)
{
alpha
[
start_cur_row
]
=
alpha
[
start_prev_row
]
+
log
(
probs
[
prob_offset
+
start_prob_col
+
blank_label
]);
}
else
if
(
start
==
1
)
{
alpha
[
start_cur_row
]
=
alpha
[
start_prev_row
];
}
}
__syncthreads
();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
#pragma unroll
for
(
int
idx
=
(
tid
+
1
);
idx
<
S
;
idx
+=
blockDim
.
x
)
{
ProbT
prev_sum
=
log_plus_f
(
alpha
[
idx
+
start_prev_row
],
alpha
[(
idx
-
1
)
+
start_prev_row
]);
// Skip two if not on blank and not on repeat.
if
((
label
[
idx
]
!=
blank_label
)
&&
(
idx
!=
1
)
&&
(
label
[
idx
]
!=
label
[
idx
-
2
]))
prev_sum
=
log_plus_f
(
prev_sum
,
alpha
[(
idx
-
2
)
+
start_prev_row
]);
alpha
[
idx
+
start_cur_row
]
=
prev_sum
+
log
(
probs
[
prob_offset
+
start_prob_col
+
label
[
idx
]]);
}
__syncthreads
();
}
if
(
tid
==
0
)
{
// Add and return the rightmost two/one element(s) in the last row.
ProbT
loglike
=
ctc_helper
::
neg_inf
<
ProbT
>
();
// This is the total increment for s_inc and e_inc through the loop
const
int
val
=
2
*
(
L
-
1
)
+
1
-
(((
L
+
repeats
)
==
T
)
?
1
:
0
);
start
=
(
val
*
(
L
!=
0
)
+
start
);
end
=
(
val
*
(
L
!=
0
)
+
end
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
loglike
=
log_plus_f
(
loglike
,
alpha
[
i
+
(
T
-
1
)
*
S
]);
}
nll_forward
[
blockIdx
.
x
]
=
-
loglike
;
}
}
// Computes backward probabilities. This also fills in a T * S matrix
//
// See comments above compute_alphas for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel(const ProbT* probs, const int* label_sizes,
                                   const int* utt_length,
                                   const int* repeats_in_labels,
                                   const int* labels_with_blanks,
                                   ProbT* alphas,
                                   const ProbT* nll_forward,
                                   ProbT* nll_backward,
                                   ProbT* grads,
                                   int stride, int out_dim,
                                   int S_memoffset, int T_memoffset,
                                   int blank_label) {

    ctc_helper::log_plus<ProbT> log_plus_f;
    typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;

    const int tid = threadIdx.x;
    const int L = label_sizes[blockIdx.x];
    const int T = utt_length[blockIdx.x];
    const int S = 2 * L + 1;
    const int prob_offset = out_dim * blockIdx.x;
    const int repeats = repeats_in_labels[blockIdx.x];
    const ProbT log_partition = -nll_forward[blockIdx.x];

    const int* labels = labels_with_blanks;
    const int* label_global = &labels[blockIdx.x * S_memoffset];
    ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];

    const int NV = NT * VT;
    union TempStorage {
        ProbT beta[NV];
        int result[NV];
    };

    __shared__ TempStorage temp_buffer;
    __shared__ int label[NV];

    // Temporaries needed for segmented reduce
    // TODO: see if we can combine the shared memory requirements
    __shared__ int keys_shared[NV];
    __shared__ int gather_indices[NV];
    __shared__ ProbT output[NV];

    ProbT beta_val[VT];

    if ((L + repeats) > T)
        return;

    int start = S > 1 ? (S - 2) : 0;
    int end = (L + repeats < T) ? S : S - 1;

    // Set up shared memory buffers
    #pragma unroll
    for (int idx = tid; idx < NV; idx += NT) {
        label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
    }
    __syncthreads();

    int uniquelabels;
    int seg_start[VT];
    int seg_end[VT];

    // Sort labels and record the indices from which to gather
    {
        int key[VT];
        int gather_val[VT];

        #pragma unroll
        for (int i = 0; i < VT; ++i) {
            const int idx = tid * VT + i;
            gather_val[i] = idx;
            key[i] = label[idx];
        }
        __syncthreads();

        CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>
            (key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
        __syncthreads();

        for (int i = 0; i < VT; ++i) {
            const int idx = tid * VT + i;
            gather_indices[idx] = gather_val[i];
        }
        __syncthreads();

        SegReduce::preprocessKeys(keys_shared, S, &uniquelabels,
                                  seg_start, seg_end, temp_buffer.result);
        __syncthreads();
    }

    // TODO: probably not necessary
    __syncthreads();

    // Initialize the beta buffer to -infinity
    #pragma unroll
    for (int idx = tid; idx < NV; idx += NT) {
        temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
    }
    __syncthreads();

    // Initialize the two rightmost values in the last row (assuming L non-zero)
    for (int i = tid; i < (end - start); i += blockDim.x)
        temp_buffer.beta[i + start] =
            log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
    __syncthreads();

    // Load output data in registers through the transpose trick - should really be a function
    #pragma unroll
    for (int idx = tid; idx < S; idx += NT) {
        output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
    }
    __syncthreads();

    // Start at the second to last row and go backward in time
    for (int t = T - 1; t >= 0; --t) {
        // Start offset into the current row
        const int start_cur_row = t * S;
        // Starting offset of the column that we read from the probs array
        const int start_prob_col = t * (out_dim * stride);

        if (t < T - 1) {
            // Filling in one row at a time but going back in time from the last row
            // to the first. As in the forward pass, there is no loop dependence and we
            // apply a variable-length filter with a maximum filter size of 3.
            #pragma unroll
            for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
                ProbT next_sum = log_plus_f(temp_buffer.beta[idx],
                                            temp_buffer.beta[idx + 1]);
                // Skip two if not on blank and not on repeat.
                if ((label[idx] != blank_label) && (idx != (S - 2)) &&
                    (label[idx] != label[idx + 2]))
                    next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);

                beta_val[i] = next_sum +
                    log(probs[prob_offset + start_prob_col + label[idx]]);
            }
            __syncthreads();

            // Initialize the rightmost column since there is nothing to its right,
            // then update the input buffer for the next iteration.
            if ((tid == 0) && (end == S))
                temp_buffer.beta[(S - 1)] = temp_buffer.beta[(S - 1)] +
                    log(probs[prob_offset + start_prob_col + blank_label]);

            #pragma unroll
            for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
                temp_buffer.beta[idx] = beta_val[i];
            }
            __syncthreads();

            // Beta computation done - add to alpha and update the gradient. Reload
            // the result into output for the segmented reduce later on.
            #pragma unroll
            for (int idx = tid; idx < S; idx += NT) {
                output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
            }
            __syncthreads();
        }
        __syncthreads();

        // Compute the segmented reduction of output using the label as key
        {
            // Somewhat faster key-value reduce
            ProbT accum[VT];
            for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
                accum[j] = ctc_helper::neg_inf<ProbT>();
                for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
                    accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
                }
            }
            __syncthreads();

            // Write the accumulated values back into output since it is no longer needed
            for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
                output[idx] = accum[j];
            }
            __syncthreads();

            for (int idx = tid; idx < out_dim; idx += blockDim.x) {
                const int grads_offset = prob_offset + start_prob_col + idx;
                grads[grads_offset] = probs[grads_offset];
            }
            __syncthreads();

            for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
                const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
                ProbT grad = output[idx];
                // Skip degenerate cases (zero probability or -inf accumulator).
                if ((grad != 0.0) && (probs[grads_offset] != 0.0) &&
                    (grad != ctc_helper::neg_inf<ProbT>())) {
                    grads[grads_offset] =
                        probs[grads_offset] -
                        exp(grad - log(probs[grads_offset]) - log_partition);
                }
            }
            __syncthreads();
        }

        // Output the backward log-likelihood
        if ((t == 0) && (tid == 0)) {
            ProbT loglike = ctc_helper::neg_inf<ProbT>();
            const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);

            start = (-val * (L != 0) + start);
            end = (-val * (L != 0) + end);

            // Sum and return the leftmost one/two value(s) in the first row
            for (int i = start; i < end; ++i)
                loglike = log_plus_f(loglike, temp_buffer.beta[i]);

            nll_backward[blockIdx.x] = -loglike;
        }
        // For some reason this synchronization is important
        __syncthreads();
    }
}
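The gradient update above is the standard CTC result: for each symbol k at time t, grad_t(k) = p_t(k) - exp(logsumexp over states s with label k of (alpha_t(s) + beta_t(s)) - log p_t(k) - logZ), where logZ is the forward log-likelihood (log_partition). A hedged NumPy restatement of the per-timestep segmented reduction, assuming alpha and beta are already computed in log space:

import numpy as np

def ctc_grads(log_probs, alpha, beta, labels_with_blanks, log_partition):
    """Gradient of the negative log-likelihood w.r.t. the softmax outputs,
    mirroring the kernel's per-timestep segmented reduce over equal labels."""
    T, A = log_probs.shape
    probs = np.exp(log_probs)
    grads = probs.copy()                      # default: prob minus nothing
    for t in range(T):
        # Accumulate alpha+beta per distinct label (the segmented reduce).
        acc = {}
        for s, k in enumerate(labels_with_blanks):
            acc[k] = np.logaddexp(acc.get(k, -np.inf), alpha[t, s] + beta[t, s])
        for k, v in acc.items():
            grads[t, k] = probs[t, k] - np.exp(v - log_probs[t, k] - log_partition)
    return grads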
template<typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT* probs,
                                     const ProbT* const denom,
                                     int alphabet_size,
                                     int count) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    #pragma unroll
    for (int i = 0; i < VT; i++) {
        if (idx < count) {
            const int column_idx = idx / alphabet_size;
            probs[idx] = f(probs[idx]) / denom[column_idx];
        }
        idx += stride;
    }
}
template<typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
                                         const ProbT* const col_max,
                                         int alphabet_size,
                                         int count) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    #pragma unroll
    for (int i = 0; i < VT; i++) {
        if (idx < count) {
            const int column_idx = idx / alphabet_size;
            probs[idx] = f(probs[idx] - col_max[column_idx]);
        }
        idx += stride;
    }
}
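Taken together with the reductions in src/reduce.cu below, these two kernels implement a numerically stable column-wise softmax over the activations: reduce_max finds each column's maximum, prepare_stable_SM_kernel shifts by it, reduce_exp produces the denominator, and compute_probs_kernel normalizes. A minimal NumPy sketch of that pipeline; the composition of the ops is my reading of the upstream warp-ctc GPU path (identity for the shift, exponential for the final normalization), not code from this file:

import numpy as np

def stable_softmax_columns(acts):
    # acts: (alphabet_size, n_cols); each column is one (time, batch) slot.
    col_max = acts.max(axis=0)            # reduce_max over each column
    shifted = acts - col_max              # prepare_stable_SM_kernel with identity op
    denom = np.exp(shifted).sum(axis=0)   # reduce_exp
    return np.exp(shifted) / denom        # compute_probs_kernel with exponential op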
include/detail/hostdevice.h deleted 100644 → 0
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
include/detail/reduce.h deleted 100644 → 0
#pragma once

ctcStatus_t reduce_negate(const float* input, float* output,
                          int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream);
pytorch_binding/setup.cfg deleted 100644 → 0
[tool:pytest]
pytorch_binding/setup.py deleted 100644 → 0
import torch
from typing import List, Optional, Union
import glob
import os
import shlex
import subprocess
import sys

from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from setuptools import find_packages, setup
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging  # type: ignore[attr-defined]

SUBPROCESS_DECODE_ARGS = ()  # args for bytes.decode(); empty means default utf-8


def _find_rocm_home() -> Optional[str]:
    rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH')
    if rocm_home is None:
        try:
            pipe_hipcc = subprocess.Popen(
                ["which hipcc | xargs readlink -f"],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            hipcc, _ = pipe_hipcc.communicate()
            rocm_home = os.path.dirname(os.path.dirname(
                hipcc.decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n')))
            if os.path.basename(rocm_home) == 'hip':
                rocm_home = os.path.dirname(rocm_home)
        except Exception:
            rocm_home = '/opt/rocm'
            if not os.path.exists(rocm_home):
                rocm_home = None
    if rocm_home and torch.version.hip is None:
        print(f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'")
    return rocm_home


def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
    if cflags is not None:
        for flag in cflags:
            if 'amdgpu-target' in flag:
                return ['-fno-gpu-rdc']
    archs = os.environ.get('PYTORCH_ROCM_ARCH', 'gfx900;gfx906')
    flags = ['--amdgpu-target=%s' % arch for arch in archs.split(';')]
    flags += ['-fno-gpu-rdc']
    return flags


ROCM_HOME = _find_rocm_home()
IS_HIP_EXTENSION = True if ((ROCM_HOME is not None) and (torch.version.hip is not None)) else False

COMMON_HIP_FLAGS = [
    '-fPIC',
    '-D__HIP_PLATFORM_HCC__=1',
]

COMMON_HIPCC_FLAGS = [
    '-DCUDA_HAS_FP16=1',
    '-D__HIP_NO_HALF_OPERATORS__=1',
    '-D__HIP_NO_HALF_CONVERSIONS__=1',
]


def is_ninja_available():
    try:
        subprocess.check_output('ninja --version'.split())
    except Exception:
        return False
    else:
        return True


def verify_ninja_availability():
    if not is_ninja_available():
        raise RuntimeError("Ninja is required to load C++ extensions")


def _is_cuda_file(path: str) -> bool:
    valid_ext = ['.cu', '.cuh']
    if IS_HIP_EXTENSION:
        valid_ext.append('.hip')
    return os.path.splitext(path)[1] in valid_ext


def _join_rocm_home(*paths) -> str:
    if ROCM_HOME is None:
        raise EnvironmentError('ROCM_HOME environment variable is not set. ')
    return os.path.join(ROCM_HOME, *paths)


def _write_ninja_file(path, cflags, post_cflags, cuda_cflags, cuda_post_cflags,
                      sources, objects, ldflags, library_target, with_cuda) -> None:
    def sanitize_flags(flags):
        if flags is None:
            return []
        else:
            return [flag.strip() for flag in flags]

    cflags = sanitize_flags(cflags)
    post_cflags = sanitize_flags(post_cflags)
    cuda_cflags = sanitize_flags(cuda_cflags)
    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
    ldflags = sanitize_flags(ldflags)

    assert len(sources) == len(objects)
    assert len(sources) > 0

    compiler = os.environ.get('CXX', 'c++')

    config = ['ninja_required_version = 1.3']
    config.append(f'cxx = {compiler}')
    if with_cuda:
        if IS_HIP_EXTENSION:
            nvcc = _join_rocm_home('bin', 'hipcc')
        config.append(f'nvcc = {nvcc}')

    flags = [f'cflags = {" ".join(cflags)}']
    flags.append(f'post_cflags = {" ".join(post_cflags)}')
    if with_cuda:
        flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}')
        flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}')
    flags.append(f'ldflags = {" ".join(ldflags)}')

    sources = [os.path.abspath(file) for file in sources]

    compile_rule = ['rule compile']
    compile_rule.append(' command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags')
    compile_rule.append(' depfile = $out.d')
    compile_rule.append(' deps = gcc')

    if with_cuda:
        cuda_compile_rule = ['rule cuda_compile']
        nvcc_gendeps = ''
        required_cuda_version = packaging.version.parse('10.2')
        has_cuda_version = torch.version.cuda is not None
        if has_cuda_version and packaging.version.parse(torch.version.cuda) >= required_cuda_version:
            cuda_compile_rule.append(' depfile = $out.d')
            cuda_compile_rule.append(' deps = gcc')
        cuda_compile_rule.append(
            f' command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags')

    build = []
    for source_file, object_file in zip(sources, objects):
        is_cuda_source = _is_cuda_file(source_file) and with_cuda
        rule = 'cuda_compile' if is_cuda_source else 'compile'
        source_file = source_file.replace(" ", "$ ")
        object_file = object_file.replace(" ", "$ ")
        build.append(f'build {object_file}: {rule} {source_file}')

    if library_target is not None:
        link_rule = ['rule link']
        link_rule.append(' command = $cxx $in $ldflags -o $out')
        link = [f'build {library_target}: link {" ".join(objects)}']
        default = [f'default {library_target}']
    else:
        link_rule, link, default = [], [], []

    blocks = [config, flags, compile_rule]
    if with_cuda:
        blocks.append(cuda_compile_rule)
    blocks += [link_rule, build, link, default]
    with open(path, 'w') as build_file:
        for block in blocks:
            lines = '\n'.join(block)
            build_file.write(f'{lines}\n\n')


def _get_num_workers(verbose: bool) -> Optional[int]:
    max_jobs = os.environ.get('MAX_JOBS')
    if max_jobs is not None and max_jobs.isdigit():
        if verbose:
            print(f'Using envvar MAX_JOBS ({max_jobs}) as the number of workers...')
        return int(max_jobs)
    if verbose:
        print('Allowing ninja to set a default number of workers... ')
    return None


def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> None:
    command = ['ninja', '-v']
    num_workers = _get_num_workers(verbose)
    if num_workers is not None:
        command.extend(['-j', str(num_workers)])
    env = os.environ.copy()
    try:
        sys.stdout.flush()
        sys.stderr.flush()
        stdout_fileno = 1
        subprocess.run(
            command,
            stdout=stdout_fileno if verbose else subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=build_directory,
            check=True,
            env=env)
    except subprocess.CalledProcessError as e:
        _, error, _ = sys.exc_info()
        message = error_prefix
        if hasattr(error, 'output') and error.output:  # type: ignore[union-attr]
            message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}"  # type: ignore[union-attr]
        raise RuntimeError(message) from e


def _write_ninja_file_and_compile_objects(
        sources: List[str],
        objects,
        cflags,
        post_cflags,
        cuda_cflags,
        cuda_post_cflags,
        build_directory: str,
        verbose: bool,
        with_cuda: Optional[bool]) -> None:
    verify_ninja_availability()
    compiler = os.environ.get('CXX', 'c++')
    if with_cuda is None:
        with_cuda = any(map(_is_cuda_file, sources))
    build_file_path = os.path.join(build_directory, 'build.ninja')
    if verbose:
        print(f'Emitting ninja build file {build_file_path}...')
    _write_ninja_file(
        path=build_file_path,
        cflags=cflags,
        post_cflags=post_cflags,
        cuda_cflags=cuda_cflags,
        cuda_post_cflags=cuda_post_cflags,
        sources=sources,
        objects=objects,
        ldflags=None,
        library_target=None,
        with_cuda=with_cuda)
    if verbose:
        print('Compiling objects...')
    _run_ninja_build(
        build_directory,
        verbose,
        error_prefix='Error compiling objects for extension')


class BuildReleaseExtension(BuildExtension):
    def __init__(self, *args, **kwargs) -> None:
        super(BuildReleaseExtension, self).__init__(*args, **kwargs)

    def build_extensions(self) -> None:
        self._check_abi()

        cuda_ext = False
        extension_iter = iter(self.extensions)
        extension = next(extension_iter, None)
        while not cuda_ext and extension:
            for source in extension.sources:
                _, ext = os.path.splitext(source)
                if ext == '.cu':
                    cuda_ext = True
                    break
            extension = next(extension_iter, None)

        for extension in self.extensions:
            if isinstance(extension.extra_compile_args, dict):
                for ext in ['cxx', 'nvcc']:
                    if ext not in extension.extra_compile_args:
                        extension.extra_compile_args[ext] = []

            self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H')
            for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
                val = getattr(torch._C, f"_PYBIND11_{name}")
                self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"')
            self._define_torch_extension_name(extension)
            self._add_gnu_cpp_abi_flag(extension)

        self.compiler.src_extensions += ['.cu', '.cuh', '.hip']

        def append_std17_if_no_std_present(cflags) -> None:
            cpp_format_prefix = '/{}:' if self.compiler.compiler_type == 'msvc' else '-{}='
            cpp_flag_prefix = cpp_format_prefix.format('std')
            cpp_flag = cpp_flag_prefix + 'c++14'
            if not any(flag.startswith(cpp_flag_prefix) for flag in cflags):
                cflags.append(cpp_flag)

        def convert_to_absolute_paths_inplace(paths):
            if paths is not None:
                for i in range(len(paths)):
                    if not os.path.isabs(paths[i]):
                        paths[i] = os.path.abspath(paths[i])

        def unix_wrap_ninja_compile(sources,
                                    output_dir=None,
                                    macros=None,
                                    include_dirs=None,
                                    debug=0,
                                    extra_preargs=None,
                                    extra_postargs=None,
                                    depends=None):
            output_dir = os.path.abspath(output_dir)
            convert_to_absolute_paths_inplace(self.compiler.include_dirs)

            _, objects, extra_postargs, pp_opts, _ = \
                self.compiler._setup_compile(output_dir, macros,
                                             include_dirs, sources,
                                             depends, extra_postargs)
            common_cflags = self.compiler._get_cc_args(pp_opts, debug, extra_preargs)
            extra_cc_cflags = self.compiler.compiler_so[1:]
            if (debug):
                print("debug mode")
            else:
                extra_cc_cflags.remove('-g')
                extra_cc_cflags.remove('-Wall')
                print("release mode")
            with_cuda = any(map(_is_cuda_file, sources))

            if isinstance(extra_postargs, dict):
                post_cflags = extra_postargs['cxx']
            else:
                post_cflags = list(extra_postargs)
            if IS_HIP_EXTENSION:
                post_cflags = COMMON_HIP_FLAGS + post_cflags
            append_std17_if_no_std_present(post_cflags)

            cuda_post_cflags = None
            cuda_cflags = None
            if with_cuda:
                cuda_cflags = common_cflags
                if isinstance(extra_postargs, dict):
                    cuda_post_cflags = extra_postargs['nvcc']
                else:
                    cuda_post_cflags = list(extra_postargs)
                if IS_HIP_EXTENSION:
                    cuda_post_cflags = cuda_post_cflags + _get_rocm_arch_flags(cuda_post_cflags)
                    cuda_post_cflags = COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags
                append_std17_if_no_std_present(cuda_post_cflags)
                cuda_cflags = [shlex.quote(f) for f in cuda_cflags]
                cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags]

            _write_ninja_file_and_compile_objects(
                sources=sources,
                objects=objects,
                cflags=[shlex.quote(f) for f in extra_cc_cflags + common_cflags],
                post_cflags=[shlex.quote(f) for f in post_cflags],
                cuda_cflags=cuda_cflags,
                cuda_post_cflags=cuda_post_cflags,
                build_directory=output_dir,
                verbose=True,
                with_cuda=with_cuda)

            return objects

        self.compiler.compile = unix_wrap_ninja_compile
        build_ext.build_extensions(self)


def get_version():
    return "0.1"


def get_extensions():
    extensions = []
    include_dirs = []
    define_macros = []
    extra_compile_args = {'cxx': ['-O3'], 'nvcc': []}

    args = []
    args += ['-DWARPCTC_ENABLE_GPU']
    args += ['-DCTC_DISABLE_OMP']
    # args += ['-DDEBUG_KERNEL']
    args += ['-Wno-deprecated']
    extra_compile_args['cxx'] += args
    extra_compile_args['nvcc'] += args

    op_files = glob.glob('./src/*.cu') + glob.glob('./src/*.cpp') + \
        ['../src/reduce.cu', '../src/ctc_entrypoint.cu']
    print('op_files = ', op_files)
    extension = CUDAExtension
    include_dirs.append(os.path.realpath('../include/'))
    include_dirs.append('/opt/dtk/rocrand/include/')
    include_dirs.append('/opt/dtk/hiprand/include/')
    print('include_dirs = ', include_dirs)

    ext_ops = extension(
        name="_warp_ctc",
        sources=op_files,
        include_dirs=include_dirs,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args)
    extensions.append(ext_ops)
    return extensions


def main():
    setup(
        name='warpctc_pytorch',
        version=get_version(),
        description='Torch fuseop Computer Vision Foundation',
        keywords='computer vision',
        packages=find_packages(),
        include_package_data=False,
        package_data={'warpctc_pytorch': ["src/*.cuh", "src/*.cu", "src/*.hip", "src/*.cpp"]},
        ext_modules=get_extensions(),
        cmdclass={'build_ext': BuildReleaseExtension},
        zip_safe=False)


if __name__ == "__main__":
    main()
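Because BuildReleaseExtension reads its configuration from environment variables, a DCU/ROCm build can be driven with a small wrapper. A sketch (the architecture string, job count, and ROCM_HOME value are examples only, not values mandated by this repository):

# build.py - hypothetical driver around the setup.py above
import os
import subprocess
import sys

os.environ.setdefault('PYTORCH_ROCM_ARCH', 'gfx906')  # read by _get_rocm_arch_flags
os.environ.setdefault('MAX_JOBS', '8')                # read by _get_num_workers
os.environ.setdefault('ROCM_HOME', '/opt/rocm')       # short-circuits _find_rocm_home

subprocess.run([sys.executable, 'setup.py', 'install'], check=True)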
pytorch_binding/src/binding.cu deleted 100644 → 0
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>

#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include <THC/THCGeneral.h>
extern THCState* state;
#endif

#include "ctc.h"

int cpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label)
{
    float* probs_ptr = (float*)probs.data_ptr();
    float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
    int* sizes_ptr = (int*)sizes.data_ptr();
    int* labels_ptr = (int*)labels.data_ptr();
    int* label_sizes_ptr = (int*)label_sizes.data_ptr();
    float* costs_ptr = (float*)costs.data_ptr();

    const int probs_size = probs.size(2);

    ctcOptions options;
    memset(&options, 0, sizeof(options));
    options.loc = CTC_CPU;
    options.num_threads = 0; // will use default number of threads
    options.blank_label = blank_label;

#if defined(CTC_DISABLE_OMP) || defined(APPLE)
    // have to use at least one
    options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif

    size_t cpu_size_bytes;
    get_workspace_size(label_sizes_ptr, sizes_ptr,
                       probs_size, minibatch_size,
                       options, &cpu_size_bytes);

    float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];

    compute_ctc_loss(probs_ptr, grads_ptr,
                     labels_ptr, label_sizes_ptr,
                     sizes_ptr, probs_size,
                     minibatch_size, costs_ptr,
                     cpu_workspace, options);

    delete[] cpu_workspace;
    return 1;
}

#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label)
{
    float* probs_ptr = (float*)probs.data_ptr();
    float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
    int* sizes_ptr = (int*)sizes.data_ptr();
    int* labels_ptr = (int*)labels.data_ptr();
    int* label_sizes_ptr = (int*)label_sizes.data_ptr();
    float* costs_ptr = (float*)costs.data_ptr();

    const int probs_size = probs.size(2);

    ctcOptions options;
    memset(&options, 0, sizeof(options));
    options.loc = CTC_GPU;
    options.blank_label = blank_label;
    options.stream = at::cuda::getCurrentCUDAStream();

    size_t gpu_size_bytes;
    get_workspace_size(label_sizes_ptr, sizes_ptr,
                       probs_size, minibatch_size,
                       options, &gpu_size_bytes);

    void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);

    compute_ctc_loss(probs_ptr, grads_ptr,
                     labels_ptr, label_sizes_ptr,
                     sizes_ptr, probs_size,
                     minibatch_size, costs_ptr,
                     gpu_workspace, options);

    THCudaFree(state, (void *) gpu_workspace);
    return 1;
}
#endif

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
    m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
pytorch_binding/src/binding.hip deleted 100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/hip/HIPContext.h"
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "ATen/hip/HIPEvent.h"
#include <THH/THHGeneral.h>
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
// have to use at least one
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
pytorch_binding/src/cpu_binding.h deleted 100644 → 0
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label);
pytorch_binding/src/gpu_binding.h deleted 100644 → 0
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
            torch::Tensor grads,
            torch::Tensor labels,
            torch::Tensor label_sizes,
            torch::Tensor sizes,
            int minibatch_size,
            torch::Tensor costs,
            int blank_label);
pytorch_binding/tests/test_gpu.py deleted 100644 → 0
import torch
import warpctc_pytorch as warp_ctc


def test_empty_label(test_cpu=True, test_gpu=True):
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    grads = torch.zeros(probs.size())
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs.size(1)

    if test_cpu:
        costs = torch.zeros(minibatch_size)
        warp_ctc.cpu_ctc(probs, grads, labels, label_sizes,
                         sizes, minibatch_size, costs, 0)
        print('CPU_cost: %f' % costs.sum())
        print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))

    if test_gpu:
        probs = probs.clone().cuda()
        grads = torch.zeros(probs.size()).cuda()
        costs = torch.zeros(minibatch_size)
        warp_ctc.gpu_ctc(probs, grads, labels, label_sizes,
                         sizes, minibatch_size, costs, 0)
        print('GPU_cost: %f' % costs.sum())
        print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
        print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_empty_label(test_cpu=True, test_gpu=False)
    test_empty_label(test_cpu=False, test_gpu=True)
    # HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
pytorch_binding/tests/test_gpu_speed.py deleted 100644 → 0
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time


def test_compare_cpu(repeat_num=20):
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs.size(1)
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs.size())

    time_st = time.perf_counter()
    # 1. Run the old CPU version
    for i in range(repeat_num):
        probs_old = probs.clone()
        costs_old = costs.clone()
        grads_old = grads.clone()
        warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes,
                         sizes, minibatch_size, costs_old, 0)
        if i == 0:
            print('CPU_costs_old: %f' % costs_old.sum())
            print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(
                probs_old, grads_old, costs_old))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc old version using time: ', time_used)

    time_st = time.perf_counter()
    # 2. Run the new CPU version
    for i in range(repeat_num):
        probs_new = probs.clone()
        costs_new = costs.clone()
        grads_new = grads.clone()
        warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                             sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('CPU_costs_new: %f' % costs_new.sum())
            print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc new version using time: ', time_used)


def test_compare_gpu():
    probs0 = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous().cuda()
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs0.size(1)

    # 1. Run the new version (CPU entry point)
    probs_new = probs0.clone().cuda()
    costs_new = torch.zeros(minibatch_size)
    grads_new = torch.zeros(probs0.size())
    warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
    print('CPU_costs_new: %f' % costs_new.sum())
    print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
        probs_new, grads_new, costs_new))

    # 2. Run the old version (CPU entry point)
    probs = probs0.clone().cuda()
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs0.size())
    warp_ctc.cpu_ctc(probs0, grads, labels, label_sizes,
                     sizes, minibatch_size, costs, 0)
    print('CPU_cost: %f' % costs.sum())
    print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    test_compare_cpu()
    test_compare_gpu()
pytorch_binding/warpctc_pytorch/__init__.py deleted 100644 → 0
import torch
import warpctc_pytorch as warp_ctc
from torch.autograd import Function
from torch.nn import Module
from _warp_ctc import *  # noqa


def _assert_no_grad(tensor):
    assert not tensor.requires_grad, \
        "gradients only computed for acts - please " \
        "mark other tensors as not requiring gradients"


class _CTC(Function):
    @staticmethod
    def forward(ctx, acts, labels, act_lens, label_lens,
                size_average=False, length_average=False, blank=0):
        is_cuda = True if acts.is_cuda else False
        # print('_CTC is_cuda', is_cuda)
        acts = acts.contiguous()
        loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc
        grads = torch.zeros(acts.size()).type_as(acts)
        minibatch_size = acts.size(1)
        costs = torch.zeros(minibatch_size).cpu()
        loss_func(acts, grads, labels, label_lens,
                  act_lens, minibatch_size, costs, blank)

        costs = torch.FloatTensor([costs.sum()])

        if length_average:
            # Compute the avg. log-probability per batch sample and frame.
            total_length = torch.sum(act_lens).item()
            grads = grads / total_length
            costs = costs / total_length
        elif size_average:
            # Compute the avg. log-probability per batch sample.
            grads = grads / minibatch_size
            costs = costs / minibatch_size

        ctx.grads = grads
        return costs

    @staticmethod
    def backward(ctx, grad_output):
        _grad_output = grad_output.to(ctx.grads.device)
        return ctx.grads.mul_(_grad_output), None, None, None, None, None, None


class CTCLoss(Module):
    """
    Parameters:
        size_average (bool): normalize the loss by the batch size
            (default: `False`)
        length_average (bool): normalize the loss by the total number of frames
            in the batch. If `True`, supersedes `size_average`
            (default: `False`)
    """
    def __init__(self, blank=0, size_average=False, length_average=False):
        super(CTCLoss, self).__init__()
        self.ctc = _CTC.apply
        self.blank = blank
        self.size_average = size_average
        self.length_average = length_average

    def forward(self, acts, labels, act_lens, label_lens):
        """
        acts: Tensor of (seqLength x batch x outputDim) containing output from network
        labels: 1-dimensional Tensor containing all the targets of the batch in one sequence
        act_lens: Tensor of size (batch) containing the size of each output sequence from the network
        label_lens: Tensor of (batch) containing the label length of each example
        """
        # labels must be 1-dimensional
        if len(labels.size()) != 1:
            print('error!! len(labels.size()) must be 1, got {}'.format(len(labels.size())))
            raise ValueError
        _assert_no_grad(labels)
        _assert_no_grad(act_lens)
        _assert_no_grad(label_lens)
        return self.ctc(acts, labels, act_lens, label_lens,
                        self.size_average, self.length_average, self.blank)
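For reference, a minimal usage sketch of CTCLoss as defined above; the shapes follow the forward() docstring and the numbers are arbitrary examples (warp-ctc expects raw, unsoftmaxed activations):

import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss(blank=0)
T, N, C = 50, 4, 20                               # seq length, batch, alphabet (incl. blank)
acts = torch.randn(T, N, C, requires_grad=True)   # raw network outputs
labels = torch.IntTensor([1, 2, 2, 3, 1, 5, 4])   # all targets concatenated, 1-D
label_lens = torch.IntTensor([2, 2, 1, 2])        # per-sample target lengths (sum to 7)
act_lens = torch.IntTensor([T, T, T, T])          # per-sample input lengths

loss = ctc_loss(acts, labels, act_lens, label_lens)
loss.backward()                                   # gradients flow through _CTC.backward
print(loss.item())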
src/ctc_entrypoint.cu deleted 100644 → 0
#include <cstddef>
#include <iostream>
#include <algorithm>

#include "ctc.h"
#include "detail/cpu_ctc.h"
#ifdef __HIPCC__
#include "detail/gpu_ctc.h"
#endif

extern "C" {

int get_warpctc_version() {
    return 13;
}

const char* ctcGetStatusString(ctcStatus_t status) {
    switch (status) {
    case CTC_STATUS_SUCCESS:
        return "no error";
    case CTC_STATUS_MEMOPS_FAILED:
        return "cuda memcpy or memset failed";
    case CTC_STATUS_INVALID_VALUE:
        return "invalid value";
    case CTC_STATUS_EXECUTION_FAILED:
        return "execution failed";
    case CTC_STATUS_UNKNOWN_ERROR:
    default:
        return "unknown error";
    }
}

ctcStatus_t compute_ctc_loss(const float* const activations,
                             float* gradients,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options) {
    if (activations == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr ||
        costs == nullptr ||
        workspace == nullptr ||
        alphabet_size <= 0 ||
        minibatch <= 0)
        return CTC_STATUS_INVALID_VALUE;

    if (options.loc == CTC_CPU) {
        CpuCTC<float> ctc(alphabet_size, minibatch, workspace,
                          options.num_threads, options.blank_label);

        if (gradients != NULL)
            return ctc.cost_and_grad(activations, gradients, costs,
                                     flat_labels, label_lengths,
                                     input_lengths);
        else
            return ctc.score_forward(activations, costs, flat_labels,
                                     label_lengths, input_lengths);
    } else if (options.loc == CTC_GPU) {
#ifdef __HIPCC__
        GpuCTC<float> ctc(alphabet_size, minibatch, workspace,
                          options.stream, options.blank_label);

        if (gradients != NULL)
            return ctc.cost_and_grad(activations, gradients, costs,
                                     flat_labels, label_lengths,
                                     input_lengths);
        else
            return ctc.score_forward(activations, costs, flat_labels,
                                     label_lengths, input_lengths);
#else
        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
        return CTC_STATUS_EXECUTION_FAILED;
#endif
    } else {
        return CTC_STATUS_INVALID_VALUE;
    }
}

ctcStatus_t get_workspace_size(const int* const label_lengths,
                               const int* const input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions options,
                               size_t* size_bytes) {
    if (label_lengths == nullptr ||
        input_lengths == nullptr ||
        size_bytes == nullptr ||
        alphabet_size <= 0 ||
        minibatch <= 0)
        return CTC_STATUS_INVALID_VALUE;

    // This is the max of all S and T for all examples in the minibatch.
    int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
    int maxT = *std::max_element(input_lengths, input_lengths + minibatch);

    const int S = 2 * maxL + 1;

    *size_bytes = 0;

    if (options.loc == CTC_GPU) {
        // GPU storage
        // nll_forward, nll_backward
        *size_bytes += 2 * sizeof(float) * minibatch;
        // repeats
        *size_bytes += sizeof(int) * minibatch;
        // label offsets
        *size_bytes += sizeof(int) * minibatch;
        // utt_length
        *size_bytes += sizeof(int) * minibatch;
        // label lengths
        *size_bytes += sizeof(int) * minibatch;
        // labels without blanks - overallocate for now
        *size_bytes += sizeof(int) * maxL * minibatch;
        // labels with blanks
        *size_bytes += sizeof(int) * S * minibatch;
        // alphas
        *size_bytes += sizeof(float) * S * maxT * minibatch;
        // denoms
        *size_bytes += sizeof(float) * maxT * minibatch;
        // probs (since we will pass in activations)
        *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
    } else {
        // The CPU path could eventually replace minibatch with the max number
        // of concurrent threads if memory is really tight.

        // per-minibatch memory
        size_t per_minibatch_bytes = 0;
        // output
        per_minibatch_bytes += sizeof(float) * alphabet_size;
        // alphas
        per_minibatch_bytes += sizeof(float) * S * maxT;
        // betas
        per_minibatch_bytes += sizeof(float) * S;
        // labels w/blanks, e_inc, s_inc
        per_minibatch_bytes += 3 * sizeof(int) * S;

        *size_bytes = per_minibatch_bytes * minibatch;
        // probs
        *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
    }

    return CTC_STATUS_SUCCESS;
}

}
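The GPU branch of get_workspace_size is a straight sum of buffer sizes; the following Python restatement mirrors that accounting (a sketch assuming 4-byte float and int, matching the code above):

def gpu_workspace_bytes(label_lengths, input_lengths, alphabet_size):
    minibatch = len(label_lengths)
    maxL, maxT = max(label_lengths), max(input_lengths)
    S = 2 * maxL + 1
    f = i = 4  # sizeof(float), sizeof(int)
    return (2 * f * minibatch            # nll_forward, nll_backward
            + 4 * i * minibatch          # repeats, label offsets, utt_length, label lengths
            + i * maxL * minibatch       # labels without blanks
            + i * S * minibatch          # labels with blanks
            + f * S * maxT * minibatch   # alphas
            + f * maxT * minibatch       # denoms
            + f * alphabet_size * maxT * minibatch)  # probs

# e.g. gpu_workspace_bytes([2, 0], [2, 2], 5) for the tiny test case used earlier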
src/reduce.cu deleted 100644 → 0
// Includes, system
#include <stdio.h>
#include <stdlib.h>

// Includes, cuda
#include <cuda_runtime.h>
//#include <cublas_v2.h>
#include <cuda_runtime_api.h>

// Includes, cuda helper functions
// #include <helper_cuda.h>

// For the functors
#include "detail/ctc_helper.h"
#include "ctc.h"

const int warp_size = 64;
const int kCUDABlockNumThreads = 256;

template<int NT, typename T, typename Rop>
struct CTAReduce;

template<int NT, typename T, typename Rop>
struct CTAReduce {
    enum { Size = NT, Capacity = NT };
    struct Storage { T shared[Capacity]; };

    __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
        T* s = storage.shared;
        s[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
#pragma unroll
        for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
            if (tid + offset < count && tid < offset) {
                x = g(x, s[offset + tid]);
                s[tid] = x;
            }
            __syncthreads();
        }

        // Finish the reduction within a single warp using shuffles.
        T shuff;
        for (int offset = warp_size / 2; offset > 0; offset /= 2) {
            // shuff = __shfl_down(0xFFFFFFF, x, offset);
            shuff = __shfl_down(x, offset);
            if (tid + offset < count && tid < offset) {
                x = g(x, shuff);
            }
        }
        return x;
    }
};

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {
    typedef CTAReduce<NT, T, Rop> R;
    __shared__ typename R::Storage storage;

    int tid = threadIdx.x;
    int idx = tid;
    int col = blockIdx.x;
    T curr;

    // Each block works on a column
    if (idx < num_rows) {
        curr = f(input[idx + col * num_rows]);
    }
    // __syncthreads();
    idx += NT;

    while (idx < num_rows) {
        curr = g(curr, f(input[idx + col * num_rows]));
        idx += NT;
    }

    // Sum thread-totals over the CTA.
    curr = R::reduce(tid, curr, storage, num_rows, g);

    // Store the result
    if (tid == 0) {
        output[col] = curr;
    }
}

template<int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(Iop f, Rop g, const T* input, T* output,
                            int num_rows, int num_cols) {
    __shared__ T s[NT];

    int warps_per_block = NT / warp_size;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = threadIdx.y;
    T curr;

    if (row < num_rows && col < num_cols) {
        curr = f(input[row + col * num_rows]);
        col += blockDim.y;
        while (col < num_cols) {
            curr = g(curr, f(input[row + col * num_rows]));
            col += blockDim.y;
        }
    }
    s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
    __syncthreads();

    // Reduce
    if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
        for (int i = 1; i < warps_per_block && i < num_cols; ++i)
            curr = g(curr, s[i + threadIdx.x * warps_per_block]);
        output[row] = curr;
    }
}

struct ReduceHelper {
    template<typename T, typename Iof, typename Rof>
    static void impl(Iof f, Rof g, const T* input, T* output,
                     int num_rows, int num_cols, bool axis, CUstream stream) {
        int grid_size;

        if (axis) {
            grid_size = num_cols;
            reduce_rows<kCUDABlockNumThreads><<<grid_size, kCUDABlockNumThreads, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        } else {
            dim3 tpb(warp_size, kCUDABlockNumThreads / warp_size);
            grid_size = (num_cols + warp_size - 1) / warp_size;
            reduce_cols<kCUDABlockNumThreads><<<grid_size, tpb, 0, stream>>>
                (f, g, input, output, num_rows, num_cols);
        }
    }
};

template<typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output,
                   int rows, int cols, bool axis, CUstream stream) {
    ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
    hipStreamSynchronize(stream);
    hipError_t err = hipGetLastError();
    if (err != hipSuccess)
        return CTC_STATUS_EXECUTION_FAILED;

    return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float* input, float* output,
                          int rows, int cols, bool axis, CUstream stream) {
    return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_exp(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream) {
    return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(),
                  input, output, rows, cols, axis, stream);
}

ctcStatus_t reduce_max(const float* input, float* output,
                       int rows, int cols, bool axis, CUstream stream) {
    auto ctc_status = reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(),
                             input, output, rows, cols, axis, stream);
    return ctc_status;
}