Commit a60ed4d7 authored by lishen

warpctc from github

parent 949fcc19
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include <climits>
#include <cfloat>
#include <cstddef>
#include <algorithm>
#include <functional>
#include <iterator>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
template<typename T>
struct less : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
template<typename T>
struct plus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
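// Illustrative sketch (added for this write-up; not part of the original
// moderngpu header): because these functors are MGPU_HOST_DEVICE, the same
// fold loop compiles for host and device callers. The helper name below is
// hypothetical.
template<typename T, typename Op>
MGPU_HOST_DEVICE T ExampleFold(const T* data, int n, Op op, T init) {
    T acc = init;
    for(int i = 0; i < n; ++i)
        acc = op(acc, data[i]);
    return acc;
}
// e.g. ExampleFold(data, n, plus<int>(), 0) sums the array, and
// ExampleFold(data, n, maximum<int>(), INT_MIN) takes its maximum.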
////////////////////////////////////////////////////////////////////////////////
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
T c = a;
a = b;
b = c;
}
template<typename T>
struct DevicePair {
T x, y;
};
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
DevicePair<T> p = { x, y };
return p;
}
template<typename T> struct numeric_limits;
template<> struct numeric_limits<int> {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
a = a + b;
return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
a = a * b;
return a;
}
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::max(a, b);
#else
return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::min(a, b);
#else
return (b < a) ? b : a;
#endif
}
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
return _value;
}
MGPU_HOST_DEVICE T operator*() const {
return _value;
}
MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
return *this;
}
MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
return *this;
}
private:
T _value;
};
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _value + i;
}
MGPU_HOST_DEVICE T operator*() {
return _value;
}
MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
return counting_iterator(_value + diff);
}
MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
return counting_iterator(_value - diff);
}
MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
_value += diff;
return *this;
}
MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
_value -= diff;
return *this;
}
private:
T _value;
};
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + (-diff);
}
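// Illustrative check (added for this write-up; not part of the original
// header): step_iterator yields base + k * step for logical index k, so it can
// stand in for a strided pointer. The hypothetical host helper below verifies
// the indexing rule.
inline bool ExampleStepIteratorCheck() {
    mgpu::step_iterator<int> it(10, 3);          // base = 10, step = 3
    return (*it == 10) && (it[2] == 16) && (*(it + 4) == 22);
}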
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
return (const byte*)b - (const byte*)a;
}
// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
div_t d = div(numItems, numWorkers);
return make_int2(d.quot, d.rem);
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
int2 range;
range.x = task.x * block;
range.x += min(block, task.y);
range.y = range.x + task.x + (block < task.y);
return range;
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
int count) {
int2 range = ComputeTaskRange(block, task);
range.x *= blockSize;
range.y = min(count, range.y * blockSize);
return range;
}
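// Worked example (added for this write-up; not part of the original source):
// dividing 100 items over 8 workers gives task = make_int2(12, 4), so blocks
// 0..3 own 13 items each and blocks 4..7 own 12. The hypothetical host check
// below verifies the first and last ranges.
MGPU_HOST bool ExampleTaskRangeCheck() {
    int2 task = DivideTaskRange(100, 8);   // task.x = 12 (quot), task.y = 4 (rem)
    int2 r0 = ComputeTaskRange(0, task);   // [0, 13)
    int2 r7 = ComputeTaskRange(7, task);   // [88, 100)
    return r0.x == 0 && r0.y == 13 && r7.x == 88 && r7.y == 100;
}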
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// DeviceExtractHeadFlags returns numBits flags starting at bit index.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
int numBits) {
int index2 = index>> 5;
int shift = 31 & index;
uint headFlags = flags[index2]>> shift;
int shifted = 32 - shift;
if(shifted < numBits)
// We also need to shift in the next set of bits.
headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
headFlags &= (1<< numBits) - 1;
return headFlags;
}
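// Illustrative check (added for this write-up; not part of the original
// source): a request that straddles a word boundary pulls the low bits from
// flags[index / 32] and the remaining bits from the next word. Hypothetical
// host-side example: extracting 4 flags at bit 30 from the words below yields
// 0xf.
inline bool ExampleHeadFlagsCheck() {
    uint flags[2] = { 0xc0000000u, 0x00000003u };   // bits 30, 31 and 32, 33 set
    return 0xfu == DeviceExtractHeadFlags(flags, 30, 4);
}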
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack the VT flag bits held by each thread into 32-bit words. An integer
// number of words is consumed, because the CTA size is a multiple of 32. The
// first NT * VT / 32 threads return the packed words.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
const int Items = MGPU_DIV_UP(32, VT);
int index = 32 * tid;
int first = index / VT;
int bit = 0;
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
__syncthreads();
return packed;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
    int result = 0;
#if __CUDA_ARCH__ >= 300
    int mask = (WARP_SIZE - width)<< 8;
    asm(
        "{.reg .s32 r0;"
        ".reg .pred p;"
        "shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
        "@p add.s32 r0, r0, %5;"
        "mov.s32 %0, r0; }"
        : "=r"(result)
        : "r"(x), "r"(offset), "r"(mask), "r"(0xffffffffu), "r"(x));
#endif
    return result;
}
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
    int result = 0;
#if __CUDA_ARCH__ >= 300
    int mask = (WARP_SIZE - width)<< 8;
    asm(
        "{.reg .s32 r0;"
        ".reg .pred p;"
        "shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
        "@p max.s32 r0, r0, %5;"
        "mov.s32 %0, r0; }"
        : "=r"(result)
        : "r"(x), "r"(offset), "r"(mask), "r"(0xffffffffu), "r"(x));
#endif
    return result;
}
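// Usage sketch (added for this write-up; not part of the original source):
// shfl_add composes into a Kogge-Stone inclusive scan across a warp. Each step
// adds the value held by the lane `offset` positions below when that lane
// exists. The device helper below is hypothetical and only shows the intended
// call pattern.
MGPU_DEVICE int ExampleWarpInclusiveScan(int x) {
    #pragma unroll
    for(int offset = 1; offset < WARP_SIZE; offset *= 2)
        x = shfl_add(x, offset, WARP_SIZE);
    return x;
}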
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count the number of set bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
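// Illustrative check (added for this write-up; not part of the original
// source): each nibble of the prmt selector picks one byte from the
// concatenation {a, b}, so selector 0x0123 reverses the bytes of a. The
// hypothetical host check below exercises the emulation path.
inline bool ExamplePrmtCheck() {
    return 0x00112233u == prmt(0x33221100u, 0u, 0x0123u);
}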
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// umulhi
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
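// Illustrative check (added for this write-up; not part of the original
// source): for denom = 10 the constructor computes p = 35, coef = 0xcccccccd,
// shift = 3, so Divide(100) returns 10 and Modulus(100) returns 0. The helper
// below is a hypothetical host-side sanity check.
inline bool ExampleFastDivideCheck() {
    FastDivide fd(10);
    return fd.Divide(100) == 10 && fd.Modulus(100) == 0;
}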
#pragma GCC diagnostic pop
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Cooperative load functions.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
    bool sync = true) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegPred(int count, InputIt data, int tid,
T* reg, bool sync) {
// TODO: Attempt to issue 4 loads at a time.
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else
DeviceGlobalToRegPred<NT, VT>(count, data, tid, reg, false);
if(sync) __syncthreads();
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg2(int count, InputIt data, int tid,
T* reg, bool sync) {
DeviceGlobalToReg<NT, VT0>(count, data, tid, reg, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
reg[i] = init;
if(index < count) reg[i] = data[index];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
DeviceGlobalToRegDefault<NT, VT0>(count, data, tid, reg, init, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
reg[i] = init;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
if(i < count) reg[i] = ldg(data + i);
}
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = (i < count) ? ldg(data + i) : init;
}
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative store functions.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i)
dest[NT * i + tid] = (T2)reg[i];
if(sync) __syncthreads();
}
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = reg[i];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMemToMemLoop
// Transfer from shared memory to global, or global to shared, for transfers
// that are smaller than NT * VT in the average case. The goal is to reduce
// unnecessary comparison logic.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMem4(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<InputIt>::value_type T;
T x[VT];
const int Count = (VT < 4) ? VT : 4;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < Count; ++i)
x[i] = source[NT * i + tid];
#pragma unroll
for(int i = 0; i < Count; ++i)
dest[NT * i + tid] = x[i];
} else {
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
x[i] = source[NT * i + tid];
}
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = x[i];
}
}
if(sync) __syncthreads();
}
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
for(int i = 0; i < count; i += 4 * NT)
DeviceMemToMem4<NT, 4>(count - i, source + i, tid, dest + i,
false);
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Functions to copy between shared and global memory where the average case is
// to transfer NT * VT elements.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) dest[index] = (T2)source[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT];
DeviceGlobalToReg<NT, VT>(count, source, tid, reg, false);
DeviceRegToShared<NT, VT>(reg, tid, dest, sync);
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT1];
DeviceGlobalToReg2<NT, VT0, VT1>(count, source, tid, reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, dest, sync);
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync) {
T reg[VT];
DeviceGlobalToRegDefault<NT, VT>(count, source, tid, reg, init, false);
DeviceRegToShared<NT, VT>(reg, tid, dest, sync);
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt data, int tid,
T* dest, T init, bool sync) {
T reg[VT1];
DeviceGlobalToRegDefault2<NT, VT0, VT1>(count, data, tid, reg, init, false);
DeviceRegToShared<NT, VT1>(reg, tid, dest, sync);
}
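// Usage sketch (added for this write-up; not part of the original moderngpu
// source): a CTA of NT threads typically stages a tile of up to NT * VT
// elements into shared memory with the helpers above, synchronizes, and then
// consumes the tile. The kernel below is hypothetical and assumes the grid has
// MGPU_DIV_UP(count, NT * VT) blocks; it just copies out the first element of
// each tile.
template<int NT, int VT, typename T>
__global__ void ExampleStageTileKernel(const T* data_global, int count,
    T* tileFirst_global) {
    __shared__ T shared[NT * VT];
    int tid = threadIdx.x;
    int gid = NT * VT * blockIdx.x;
    int count2 = min(NT * VT, count - gid);
    // Cooperative load: all NT threads participate; sync = true makes the tile
    // visible to every thread before it is read.
    DeviceGlobalToShared<NT, VT>(count2, data_global + gid, tid, shared, true);
    if(!tid) tileFirst_global[blockIdx.x] = shared[0];
}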
////////////////////////////////////////////////////////////////////////////////
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync) {
const int Granularity = MGPU_MIN(VT, 3);
DeviceGlobalToShared<NT, Granularity>(count, source, tid, dest, false);
int offset = Granularity * NT;
if(count > offset)
DeviceGlobalToShared<NT, VT - Granularity>(count - offset,
source + offset, tid, dest + offset, false);
if(sync) __syncthreads();
/*
source += tid;
while(count > 0) {
T reg[Granularity];
#pragma unroll
for(int i = 0; i < Granularity; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = source[NT * i];
}
DeviceRegToShared<NT, Granularity>(reg, tid, dest, false);
source += Granularity * NT;
dest += Granularity * NT;
count -= Granularity * NT;
}
if(sync) __syncthreads();*/
}
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T;
T values[VT];
DeviceGlobalToReg<NT, VT>(count, source, tid, values, false);
DeviceRegToGlobal<NT, VT>(count, values, tid, dest, sync);
}
////////////////////////////////////////////////////////////////////////////////
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
// This function has a known bug; do not use it. Use the SmemTranspose
// workaround (WAR) or fix the bug first. TODO(erich)
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y) {
    printf("HalfSmemTranspose has a known bug; use the SmemTranspose workaround "
        "or fix the bug before using this in production.\n");
// Transpose the first half values (tid < NT / 2)
#pragma unroll
for(int i = 0; i <= VT / 2; ++i)
if(i < VT / 2 || tid < NT / 2)
shared[NT * i + tid] = x[i];
__syncthreads();
if(tid < NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i];
}
__syncthreads();
// Transpose the second half values (tid >= NT / 2)
#pragma unroll
for(int i = VT / 2; i < VT; ++i)
if(i > VT / 2 || tid >= NT / 2)
shared[NT * i - NT * VT / 2 + tid] = x[i];
__syncthreads();
if(tid >= NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i - NT * VT / 2];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Gather/scatter functions
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = data[indices[i]];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
reg[i] = (index < count) ? data[indices[i]] : identity;
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
data[indices[i]] = reg[i];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
data[indices[i]] = reg[i];
}
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative transpose functions (strided to thread order)
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync) {
if(1 & VT) {
// Odd grain size. Store as type T.
#pragma unroll
for(int i = 0; i < VT; ++i)
shared[VT * tid + i] = threadReg[i];
} else {
// Even grain size. Store as DevicePair<T>. This lets us exploit the
// 8-byte shared memory mode on Kepler.
DevicePair<T>* dest = (DevicePair<T>*)(shared + VT * tid);
#pragma unroll
for(int i = 0; i < VT / 2; ++i)
dest[i] = MakeDevicePair(threadReg[2 * i], threadReg[2 * i + 1]);
}
if(sync) __syncthreads();
}
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync) {
if(1 & VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
threadReg[i] = shared[VT * tid + i];
} else {
const DevicePair<T>* source = (const DevicePair<T>*)(shared + VT * tid);
#pragma unroll
for(int i = 0; i < VT / 2; ++i) {
DevicePair<T> p = source[i];
threadReg[2 * i] = p.x;
threadReg[2 * i + 1] = p.y;
}
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of the same type. Optimize for a single LD
// statement.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync) {
int b0 = b_global - a_global - aCount;
int total = aCount + bCount;
if(total >= NT * VT0) {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
} else {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < total)
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
}
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < total)
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
}
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync) {
T reg[VT1];
DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, shared, sync);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of different types. Uses two LD statements.
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync) {
b_global -= aCount;
int total = aCount + bCount;
if(total >= NT * VT0) {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else reg[i] = b_global[index];
}
} else {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else if(index < total) reg[i] = b_global[index];
}
}
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else if(index < total) reg[i] = b_global[index];
}
}
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync) {
T reg[VT1];
DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, shared, sync);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceGatherGlobalToGlobal
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global, bool sync) {
typedef typename std::iterator_traits<InputIt>::value_type ValType;
ValType values[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) {
int gather = indices_shared[index];
values[i] = data_global[gather];
}
}
if(sync) __syncthreads();
DeviceRegToGlobal<NT, VT>(count, values, tid, dest_global, false);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceTransferMergeValues
// Gather values in a merge-like pattern from two input arrays and store them
// to a single output. Like DeviceGatherGlobalToGlobal, but for two arrays at
// once.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync) {
b_global -= bStart;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
reg[i] = (indices[i] < bStart) ? a_global[indices[i]] :
b_global[indices[i]];
}
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = (indices[i] < bStart) ? a_global[indices[i]] :
b_global[indices[i]];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync) {
int indices[VT];
DeviceSharedToReg<NT, VT>(indices_shared, tid, indices);
typedef typename std::iterator_traits<InputIt1>::value_type ValType;
ValType reg[VT];
DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
indices, tid, reg, sync);
DeviceRegToGlobal<NT, VT>(count, reg, tid, dest_global, sync);
}
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid, T* reg,
bool sync) {
int bOffset = (int)(b_global - a_global - bStart);
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int gather = indices[i];
if(gather >= bStart) gather += bOffset;
reg[i] = a_global[gather];
}
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
int gather = indices[i];
if(gather >= bStart) gather += bOffset;
if(index < count)
reg[i] = a_global[gather];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync) {
int indices[VT];
DeviceSharedToReg<NT, VT>(indices_shared, tid, indices);
T reg[VT];
DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
indices, tid, reg, sync);
DeviceRegToGlobal<NT, VT>(count, reg, tid, dest_global, sync);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = comp(aKey, bKey);
bool pB = comp(bKey, aKey);
// The outputs must come from A by definition of set intersection.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA == pB) commit |= 1<< i;
}
}
return commit;
}
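// Worked example (added for this write-up; not part of the original source):
// for data = { 1, 3, 5, 3, 5, 7 } with aBegin = 0, aEnd = 3, bBegin = 3,
// bEnd = 6, end = 7, VT = 4, RangeCheck = true and comp = less<int>, the loop
// visits the pairs (1,3), (3,3), (5,5) and then fails the range test. It
// writes results = { 1, 3, 5, - } but sets commit = 0b0110, so only the equal
// keys 3 and 5 (iterations 1 and 2) are kept by the caller.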
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
// Both are in range.
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// Output A in case of a tie, so check if b < a.
results[i] = pB ? bKey : aKey;
indices[i] = pB ? bBegin : aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// The outputs must come from A by definition of set difference.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && (bBegin >= bEnd))
pA = true;
else if(RangeCheck && (aBegin >= aEnd))
pB = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
results[i] = pA ? aKey : bKey;
indices[i] = pA ? aBegin : bBegin;
if(!pA) ++bBegin;
if(!pB) ++aBegin;
if(pA != pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
int end = aBegin + bBegin + VT - star;
if(RangeCheck) end = min(end, aEnd + bEnd);
int commit;
switch(Op) {
case MgpuSetOpIntersection:
commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
aEnd, bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpUnion:
commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpDiff:
commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpSymDiff:
commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
}
__syncthreads();
return commit;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
// The CUDA compiler does not currently unroll these loops correctly, so the
// sort is written with template loop unrolling instead.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
#pragma unroll
for(int level = 0; level < VT; ++level) {
#pragma unroll
for(int i = 1 & level; i < VT - 1; i += 2) {
if(comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
}
}
}*/
template<int I, int VT>
struct OddEvenTransposeSortT {
// Sort segments marked by head flags. If the head flag between i and i + 1
// is set (so that (2<< i) & flags is true), the values belong to different
// segments and are not swapped.
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
#pragma unroll
for(int i = 1 & I; i < VT - 1; i += 2)
if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
}
};
template<int I> struct OddEvenTransposeSortT<I, I> {
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
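// Illustrative sketch (not part of the original library): sorting VT = 4
// register-resident key/value pairs in a single thread, assuming a less-than
// comparator such as mgpu::less<int>:
//   int keys[4] = { 7, 3, 9, 1 };
//   int vals[4] = { 0, 1, 2, 3 };
//   OddEvenTransposeSort<4>(keys, vals, mgpu::less<int>());
//   // keys become { 1, 3, 7, 9 } and the values follow their keys: { 3, 1, 0, 2 }.
// Passing head flags to OddEvenTransposeSortFlags instead confines swaps to within
// each flagged segment.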
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
int a, int b, Comp comp) {
if(b < Count) {
			// Mask the bits between a and b. Any head flag in this interval means the
			// keys are in different segments and must not be swapped.
const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
if(!(Mask & flags) && comp(keys[b], keys[a])) {
mgpu::swap(keys[b], keys[a]);
mgpu::swap(values[b], values[a]);
}
}
}
template<int R, int Low2, bool Recurse = 2 * R < Width>
struct OddEvenMerge {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
// Compare and swap
const int M = 2 * R;
OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
#pragma unroll
for(int i = Low2 + R; i + R < Low2 + Width; i += M)
CompareAndSwap(keys, values, flags, i, i + R, comp);
}
};
template<int R, int Low2>
struct OddEvenMerge<R, Low2, false> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
}
};
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) {
const int M = Width / 2;
OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
}
};
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}
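// Worked note on the Width computation above (illustrative): Width is VT rounded
// up to the next power of 2, e.g. VT = 7 and VT = 8 both give Width = 8. Positions
// at or beyond Count (= VT) are never exchanged because CompareAndSwap guards every
// swap with its (b < Count) test.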
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh
// For 0 <= i < VT:
// index = NT * i + tid;
// reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = VT * tid + i.
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init);
// For 0 <= i < VT:
// index = NT * i + tid;
//		data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync = false);
// For 0 <= index < count:
// dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
int tid, T* dest, T init, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync = false);
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// gather = indices[index];
// reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// scatter = indices[index];
// data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync = true);
// For 0 <= i < VT:
// shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync = true);
// For 0 <= i < VT:
// threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync = true);
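// Illustrative contrast of the two orderings (not part of the original interface):
// with NT = 4 threads and VT = 2 values per thread, thread tid = 1 touches
//   strided order (DeviceRegToShared / DeviceSharedToReg): NT * i + tid -> { 1, 5 }
//   thread order (DeviceThreadToShared / DeviceSharedToThread): VT * tid + i -> { 2, 3 }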
// For 0 <= index < aCount:
// shared[index] = a_global[index];
// For 0 <= index < bCount:
// shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
// If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
//		NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid;
// if(index < count)
// gather = indices_shared[index];
// dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global,
bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid
// if(index < count)
// gather = indices[index];
// if(gather < aCount) data = a_global[gather];
// else data = b_global[gather - aCount];
// dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
} // namespace mgpu
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
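// Worked examples of the macros above (illustrative only):
//   MGPU_DIV_UP(10, 4)        == 3    // ceil(10 / 4)
//   MGPU_ROUND_UP(10, 4)      == 12   // next multiple of 4
//   MGPU_ROUND_UP_POW2(10, 8) == 16   // y must be a power of 2
//   MGPU_IS_POW_2(64)         // true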
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// sIsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. If X is not a power of 2, the result is rounded
// up or down according to roundUp.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
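// Illustrative values (not from the original header): sLogPow2<8>::value == 3;
// for a non-power-of-2 argument the result is rounded per roundUp, e.g.
// sLogPow2<7, true>::value == 3 while sLogPow2<7, false>::value == 2.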
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
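// Illustrative evaluation of the templates above (values derived here, not from the
// original header): for NT = 128 threads and X = 12 values per thread,
// sNumFactorsOf2<12>::value == 2, so sBankConflictDivisor<12>::value == 4, giving
// sConflictFreeStorage<128, 12>::value == 128 * 12 + (128 * 12) / 4 == 1920 elements.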
} // namespace mgpu
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// Forward declaration of the CUDA stream type to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
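/* Illustrative sketch of the recommended initialization (not part of the API
 * itself): zero the structure, then select the CPU path with 4 threads and
 * label index 0 as the blank.
 *
 *   ctcOptions options;
 *   memset(&options, 0, sizeof(ctcOptions));
 *   options.loc = CTC_CPU;
 *   options.num_threads = 4;
 *   options.blank_label = 0;
 */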
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
 * \param [in] minibatch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
 * \param [in] minibatch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
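/* Illustrative call sequence (a sketch, not part of this header): query the
 * workspace size, allocate it in the same memory space as the activations, then
 * compute the loss and gradients. Error checking is omitted for brevity.
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths, alphabet_size, minibatch,
 *                      options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);   // cudaMalloc when loc == CTC_GPU
 *   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
 *                    input_lengths, alphabet_size, minibatch, costs,
 *                    workspace, options);
 *   free(workspace);
 */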
#ifdef __cplusplus
}
#endif
#pragma once
#include <tuple>
#include <cmath>
#include <limits>
#include <algorithm>
#include <numeric>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template<typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size, int minibatch, void* workspace, int num_threads,
int blank_label) :
alphabet_size_(alphabet_size), minibatch_(minibatch),
num_threads_(num_threads), workspace_(workspace),
blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT *grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
void* workspace, size_t bytes_used, int blank_label,
const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations, ProbT* probs,
const int* const input_lengths);
std::tuple<ProbT, bool>
cost_and_grad_kernel(ProbT *grad, const ProbT* const probs,
const int* const labels, int T, int L,
int mb, size_t bytes_used);
ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas);
ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
ProbT log_partition, int repeats,
int S, int T, const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template<typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
int alphabet_size,
void* workspace, size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template<typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
int blank_label, int L, int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1;
int repeats = 0;
for (int i = 1; i < L; ++i) {
if (labels[i-1] == labels[i]) {
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1;
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
}
else {
s_inc[s_counter++] = 2;
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1;
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label;
return repeats;
}
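// Worked example of setup_labels (illustrative, not from the original source): for
// labels {1, 1, 2} with blank_label = 0, so L = 3 and S = 7:
//   labels_w_blanks = {0, 1, 0, 1, 0, 2, 0}
//   s_inc = {1, 1, 1, 2}, e_inc = {1, 1, 2, 1}, repeats = 1 (the repeated 1).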
template<typename ProbT>
void
CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
const int* const input_lengths) {
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
for(int c = 0; c < input_lengths[mb]; ++c) {
int col_offset = (mb + minibatch_ * c) * alphabet_size_;
ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
for(int r = 0; r < alphabet_size_; ++r)
max_activation = std::max(max_activation, activations[r + col_offset]);
ProbT denom = ProbT(0.);
for(int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
for(int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
}
}
}
}
template<typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT *grad, const ProbT* const probs,
const int* const labels,
int T, int L, int mb, size_t bytes_used) {
const int S = 2*L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);
bool over_threshold = false;
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T, ctcm.e_inc,
ctcm.s_inc, ctcm.labels_w_blanks,
ctcm.alphas);
ProbT llBackward = compute_betas_and_grad(grad, probs, llForward, ctcm.repeats,
S, T, ctcm.e_inc, ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas,
ctcm.betas,
ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
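// In log space the recursion below is the standard CTC forward pass (summary added
// for clarity, not verbatim from the original comments): with the blank-augmented
// label sequence l' of length S and per-step log-probabilities log p_t(k),
//   alpha_t(s) = log p_t(l'_s) + logsumexp( alpha_{t-1}(s), alpha_{t-1}(s-1),
//                 [ alpha_{t-1}(s-2) only if l'_s is not blank and l'_s != l'_{s-2} ] )
// and the returned log-likelihood is logsumexp over the last one or two entries of
// the final column.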
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S /2) + repeats - T) < 0) ? 0 : 1,
end = S > 1 ? 2 : 1;
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
for(int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
if(remain >= 0)
start += s_inc[remain];
if(t <= (S / 2) + repeats)
end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
startloop += 1;
}
for(int i = startloop; i < end; ++i) {
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i-1) + idx2]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i-2])
prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i-2) + idx2]);
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for(int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
return loglike;
}
// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we update the product alpha * beta and then
// sum it into the gradient associated with each label.
// NOTE: this computes the gradient w.r.t. the UNNORMALIZED final-layer activations.
// The grads passed in are assumed to already be zeroed!
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
ProbT log_partition, int repeats,
int S, int T, const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0,
end = (T > (S / 2) + repeats) ? S : S-1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
//set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
//compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
//update the gradient associated with this label
//essentially performing a reduce-by-key in a sequential manner
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
}
//update the gradient w.r.t. each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] -
std::log(probs[idx3]) - log_partition);
}
}
//loop from the second to last column all the way to the left
for(int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if(remain >= -1)
start -= s_inc[remain + 1];
if(t < (S / 2) + repeats)
end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for(int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i+1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S-2) && labels[i] != labels[i+2]){
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i+2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
//compute alpha * beta in log space
alphas[i + idx1] += betas[i];
//update the gradient associated with this label
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S-1)] = betas[(S-1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S-1) + idx1] += betas[(S-1)];
output[labels[S-1]] =
ctc_helper::log_plus<ProbT>()(alphas[S-1 + idx1], output[labels[S-1]]);
}
//go over the unique labels and compute the final grad
// w.r.t. each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] -
std::log(probs[idx3]) - log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for(int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
return loglike;
}
template<typename ProbT>
ctcStatus_t
CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT *grads,
ProbT *costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT *>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
//per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
//alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
//betas
per_minibatch_bytes += sizeof(float) * maxS;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) =
cost_and_grad_kernel(grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T, L, mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT *>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
//per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
//alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
//betas
per_minibatch_bytes += sizeof(float) * maxS;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2*L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
bytes_used + mb * per_minibatch_bytes, blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_, ctcm.repeats, S, T,
ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks,
ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template <typename Arg, typename Res = Arg> struct maximum {
HOSTDEVICE
Res operator()(const Arg& x, const Arg& y) const {
return x < y ? y : x;
}
};
template <typename Arg, typename Res = Arg> struct add {
HOSTDEVICE
Res operator()(const Arg& x, const Arg& y) const {
return x + y;
}
};
template <typename Arg, typename Res = Arg> struct identity {
HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);}
};
template <typename Arg, typename Res = Arg> struct negate {
HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);}
};
template <typename Arg, typename Res = Arg> struct exponential {
HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);}
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1& p1, const Arg2& p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
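// Illustrative identity (not part of the original header): log_plus computes
// log(exp(p1) + exp(p2)) stably as max(p1, p2) + log1p(exp(-|p1 - p2|)); for
// example, log_plus<float>()(std::log(0.2f), std::log(0.3f)) is approximately
// std::log(0.5f).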
}
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
template <typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void *workspace,
CUstream stream,
int blank_label) :
out_dim_(alphabet_size), minibatch_(minibatch),
gpu_workspace_(workspace), stream_(stream),
blank_label_(blank_label) {};
// Noncopyable
GpuCTC(const GpuCTC&) = delete;
GpuCTC& operator=(const GpuCTC&) = delete;
ctcStatus_t
cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t
score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
template<int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t
launch_gpu_kernels(const ProbT* const probs,
ProbT *grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t
setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t
create_metadata_and_choose_config(const int* const flat_labels,
                                  const int* const label_lengths,
const int* const input_lengths,
size_t& best_config);
ctcStatus_t
compute_probs(const ProbT* const activations);
ctcStatus_t
compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void *gpu_workspace_; // Buffer for all temporary GPU memory
int *utt_length_; // T
int *label_sizes_; // L
int *repeats_; // repeats_
int *label_offsets_;
int *labels_without_blanks_;
int *labels_with_blanks_;
ProbT *alphas_;
ProbT *nll_forward_;
ProbT *nll_backward_;
ProbT *denoms_; // Temporary storage for denoms for softmax
ProbT *probs_; // Temporary storage for probabilities (softmax output)
};
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths)
{
size_t gpu_bytes_used = 0;
nll_forward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass+1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i-1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta ) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries
// away
const int stride = minibatch_;
if (compute_alpha)
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_,
repeats_, labels_without_blanks_, label_offsets_,
labels_with_blanks_, alphas_, nll_forward_,
stride, out_dim_, S_, T_, blank_label_);
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_, repeats_,
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
grads, stride, out_dim_, S_, T_, blank_label_);
cudaStreamSynchronize(stream_);
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
size_t& best_config) {
// Setup the metadata for GPU
ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS)
return status;
constexpr int num_configs = 12;
int config_NT[num_configs] =
{32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] =
{ 1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i]* config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs)
return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
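// Illustrative example of the choice above (not from the original source): the
// candidate tile sizes NT * VT are {32, 64, 128, 192, 256, 288, 384, 512, 576, 768,
// 1152, 1280}; the first one that covers S_ is picked. For S_ = 201 (a longest valid
// label length of 100) that is config 4, i.e. NT = 128, VT = 2.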
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const probs,
ProbT* grads,
size_t config,
bool l_a,
bool l_b) {
switch(config) {
case 0: {return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);}
case 1: {return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);}
case 2: {return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);}
case 3: {return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);}
case 4: {return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);}
case 5: {return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);}
case 6: {return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);}
case 7: {return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);}
case 8: {return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);}
case 9: {return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);}
case 10: {return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);}
case 11: {return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
cudaError_t cuda_status;
cuda_status =
cudaMemcpyAsync(probs_, activations,
activation_cols_ * out_dim_ *sizeof(ProbT),
cudaMemcpyDeviceToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable softmax
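// The four launches below implement, per column j of probs_ (explanatory summary,
// not original text):
//   m_j = max_i probs_[i, j]                   (reduce_max into denoms_)
//   probs_[i, j] -= m_j                        (prepare_stable_SM_kernel)
//   d_j = sum_i exp(probs_[i, j])              (reduce_exp into denoms_)
//   probs_[i, j] = exp(probs_[i, j]) / d_j     (compute_probs_kernel)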
ctcStatus_t ctc_status =
reduce_max(probs_, denoms_, out_dim_,
activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), probs_,
denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status =
reduce_exp(probs_, denoms_, out_dim_,
activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::exponential<ProbT>(), probs_,
denoms_, out_dim_, num_elements);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
label_lengths,
input_lengths,
best_config);
if (status != CTC_STATUS_SUCCESS)
return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS)
return status;
launch_gpu_kernels(probs_, grads, best_config,
compute_alpha, compute_betas_and_grad);
cudaError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = cudaMemcpyAsync(costs, nll_forward_,
sizeof(ProbT) * minibatch_,
cudaMemcpyDeviceToHost, stream_);
cuda_status_sync = cudaStreamSynchronize(stream_);
if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, grads, costs, flat_labels,
label_lengths, input_lengths, true, true);
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, nullptr, costs, flat_labels,
label_lengths, input_lengths, true, false);
}
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
using namespace mgpu;
template<int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum {NV = NT * VT};
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
//adapted from global kernel KernelReduceByKeyPreprocess
__device__ static void preprocessKeys(KeyT *keys, int count,
int *numUniqueLabels, int seg_start[VT],
int seg_end[VT], int *scanout) {
__shared__ Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if(index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
//Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
//output the unique keys
//use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ( (endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels); idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx-1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
//copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, thus a sparse
// set of values is read from the probs matrix, since the character set is much larger
// than the label sequence. This is even more true for Mandarin than for English.
template<typename ProbT, int NT, int VT>
__global__
void compute_alpha_kernel (const ProbT* probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_without_blanks, const int *label_offsets,
int *labels_with_blanks, ProbT *alphas,
ProbT* nll_forward, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2*L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T)
return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset+1] = labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int* label_global = &labels[blockIdx.x * S_memoffset];
ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
    // Set the first row of alpha to neg_inf - it is much more efficient to do
    // this here than outside the kernel
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
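    // At t = 0 only the leading blank (s = 0) and the first label (s = 1) are
    // reachable; when L + repeats == T every time step must advance through the
    // label sequence, so the optional leading blank is skipped (start = 1).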
// Initialize the first row corresponding to t=0;
for(int i = tid; i < (end-start); i += blockDim.x)
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for(int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
        // The probs matrix is a 2D column-major array; the probabilities for each t
        // are strided by (out_dim * stride), where stride is the minibatch size
const int start_prob_col = t * (out_dim * stride);
        // This is the first column (s = 0); there is nothing to the left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] = alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
}
else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
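        // In log space this is the usual CTC alpha recurrence:
        //   alpha[t][s] = logadd(alpha[t-1][s], alpha[t-1][s-1], alpha[t-1][s-2])
        //                 + log p(label[s] | t)
        // where the alpha[t-1][s-2] term is included only when s > 1, label[s] is
        // not a blank, and label[s] != label[s-2].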
#pragma unroll
for (int idx = (tid+1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], alpha[(idx-1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != 1) && (label[idx] != label[idx-2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx-2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
        // Log-add the rightmost one/two element(s) in the last row to obtain the log-likelihood.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
        // val is the total increment that start and end accumulate over the time loop
        // (cf. s_inc / e_inc in the CPU implementation), moving them to the last row.
const int val = 2 * (L-1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L!=0) + start);
end = (val * (L!=0) + end);
for(int i = start; i < end; ++i)
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
nll_forward[blockIdx.x] = -loglike;
}
}
// Computes backward probabilities (betas) and, fused with them, the gradient.
// Only the current row of betas is kept in shared memory; as each row is
// produced it is combined with the corresponding alphas and reduced per label
// into the gradient, so the full T * S beta matrix is never stored.
//
// See the comments above compute_alpha_kernel for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel (const ProbT* probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_with_blanks, ProbT *alphas,
const ProbT* nll_forward, ProbT *nll_backward,
ProbT *grads, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2*L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int* labels = labels_with_blanks;
const int* label_global = &labels[blockIdx.x * S_memoffset];
ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__ TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__ ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T)
return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S-1;
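    // Mirror of the forward pass: at t = T-1 only the last label (s = S-2) and the
    // trailing blank (s = S-1) can be occupied, and the trailing blank is excluded
    // when L + repeats == T.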
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
    // Sort the labels and record the original indices to gather from during the segmented reduce
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>
(key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(keys_shared, S, &uniquelabels, seg_start, seg_end,
temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for(int i = tid; i < (end-start); i += blockDim.x)
temp_buffer.beta[i + start] =
log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
    // Combine alpha and beta for the last row into the shared output buffer - should really be a helper function
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
    // Iterate from t = T-1 backward in time; the beta recurrence itself starts at the second-to-last row (t = T-2)
for(int t = T - 1; t >= 0; --t) {
// Start offsets into the current and next row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T-1) {
            // Fill one row at a time, going backward in time from the last row to
            // the first. As in the forward pass there is no loop dependence within a
            // row; we apply a variable-length filter with a maximum width of 3
#pragma unroll
for(int idx = tid, i = 0; idx < (S-1); idx += NT, i++) {
ProbT next_sum = log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx+1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != (S-2)) && (label[idx] != label[idx+2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx+2]);
beta_val[i] = next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
            // Update the rightmost element (the trailing blank) separately since there
            // is nothing to its right; this prepares the input buffer for the next iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S-1)] = temp_buffer.beta[(S-1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for(int idx = tid, i = 0; idx < (S-1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
            // Beta computation done - combine beta with alpha into the shared output
            // buffer; the segmented reduce below turns it into per-label gradient contributions
#pragma unroll
for(int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
            // Write the accumulated values back into output, which is free to be reused at this point
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
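            // For each unique label k the loop below applies the standard CTC gradient
            // with respect to the pre-softmax activations:
            //   grad_t(k) = y_t(k) - exp( logadd_{s : label[s] == k} (alpha_t(s) + beta_t(s))
            //                             - log y_t(k) - log p(l | x) )
            // where y_t(k) = probs[...] and log p(l | x) = log_partition.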
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
            if ((grad != 0.0) && (probs[grads_offset] != 0.0) &&
                (grad != ctc_helper::neg_inf<ProbT>())) {
                grads[grads_offset] =
                    probs[grads_offset] - exp(grad - log(probs[grads_offset]) - log_partition);
            }
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L-1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for(int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
        // Keep the block's barriers aligned across iterations and make sure every
        // thread is done with the shared buffers before they are overwritten
__syncthreads();
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT* probs,
const ProbT* const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for(int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
const ProbT* const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for(int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
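// Taken together, prepare_stable_SM_kernel and compute_probs_kernel are the
// element-wise pieces of a numerically stable, column-wise softmax: the first
// applies the caller-supplied functor to the max-shifted activations, and the
// second applies its functor and divides by the per-column denominator in denom.
// The reductions that produce col_max and denom are performed elsewhere by the caller.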
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
# Ignore compiled FFI location
warpctc_pytorch/_warp_ctc
# Created by https://www.gitignore.io/api/python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule.*
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# End of https://www.gitignore.io/api/python
import os
import platform
import sys
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CppExtension
import torch
extra_compile_args = ['-std=c++14', '-fPIC']
warp_ctc_path = "../build"
if platform.system() == 'Darwin':
    lib_ext = ".dylib"
else:
    lib_ext = ".so"
if "WARP_CTC_PATH" in os.environ:
    warp_ctc_path = os.environ["WARP_CTC_PATH"]
if not os.path.exists(os.path.join(warp_ctc_path, "libwarpctc" + lib_ext)):
    print(("Could not find libwarpctc{} in {}.\n"
           "Build warp-ctc and set WARP_CTC_PATH to the location of"
           " libwarpctc{} (default is '../build')").format(lib_ext, warp_ctc_path, lib_ext))
    sys.exit(1)
include_dirs = [os.path.realpath('../include')]
if torch.cuda.is_available() or "CUDA_HOME" in os.environ:
    enable_gpu = True
else:
    print("CUDA is not available to Torch and CUDA_HOME is not set; "
          "not building warp-ctc GPU extensions.")
    enable_gpu = False
if enable_gpu:
    from torch.utils.cpp_extension import CUDAExtension
    build_extension = CUDAExtension
    extra_compile_args += ['-DWARPCTC_ENABLE_GPU']
else:
    build_extension = CppExtension
ext_modules = [
    build_extension(
        name='warpctc_pytorch._warp_ctc',
        language='c++',
        sources=['src/binding.cpp'],
        include_dirs=include_dirs,
        library_dirs=[os.path.realpath(warp_ctc_path)],
        libraries=['warpctc'],
        extra_link_args=['-Wl,-rpath,' + os.path.realpath(warp_ctc_path)],
        extra_compile_args=extra_compile_args
    )
]
setup(
    name="warpctc_pytorch",
    version="0.1",
    description="PyTorch wrapper for warp-ctc",
    url="https://github.com/SeanNaren/warp-ctc",
    author="Jared Casper, Sean Naren",
    author_email="jared.casper@baidu.com, sean.narenthiran@digitalreasoning.com",
    license="Apache",
    packages=find_packages(),
    ext_modules=ext_modules,
    cmdclass={'build_ext': BuildExtension}
)
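# A typical build from the pytorch_binding directory, assuming warp-ctc itself has
# already been compiled into ../build (otherwise point WARP_CTC_PATH at the
# directory containing libwarpctc):
#
#   WARP_CTC_PATH=../build python setup.py install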
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include "THC.h"
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
    // have to use at least one thread
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::cuda::getCurrentCUDAStream();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
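// These entry points are exposed to Python as the warpctc_pytorch._warp_ctc
// extension built by the setup.py above. As elsewhere in warp-ctc, probs is
// expected to be a float tensor laid out as (seq_len, minibatch, alphabet_size)
// - probs.size(2) is passed to the library as the alphabet size - and costs
// receives one loss value per minibatch entry.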