"tests/vscode:/vscode.git/clone" did not exist on "cd3fa030351d9d386991b7a943889f980db00513"
Commit 0bf5eb5f authored by lishen

warpctc for dcu

parent 949fcc19
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
return (const byte*)b - (const byte*)a;
}
// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
div_t d = div(numItems, numWorkers);
return make_int2(d.quot, d.rem);
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
int2 range;
range.x = task.x * block;
range.x += min(block, task.y);
range.y = range.x + task.x + (block < task.y);
return range;
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
int count) {
int2 range = ComputeTaskRange(block, task);
range.x *= blockSize;
range.y = min(count, range.y * blockSize);
return range;
}
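// Worked example (illustrative): partitioning 10 items over 4 workers with the
// two helpers above.
//   int2 task = DivideTaskRange(10, 4);    // task = (quot = 2, rem = 2)
//   ComputeTaskRange(0, task) == (0, 3)    // the first rem blocks get quot + 1 items
//   ComputeTaskRange(1, task) == (3, 6)
//   ComputeTaskRange(2, task) == (6, 8)    // remaining blocks get quot items
//   ComputeTaskRange(3, task) == (8, 10)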
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// ExtractThreadHeadFlags returns numBits flags starting at bit index.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
int numBits) {
int index2 = index>> 5;
int shift = 31 & index;
uint headFlags = flags[index2]>> shift;
int shifted = 32 - shift;
if(shifted < numBits)
// We also need to shift in the next set of bits.
headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
headFlags &= (1<< numBits) - 1;
return headFlags;
}
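// Worked example (illustrative): index = 30, numBits = 4 reads bits 30..31 of
// flags[0] and, because shifted = 2 < numBits, pulls bits 0..1 of flags[1] in
// above them before masking the result to the low 4 bits.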
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack the VT flag bits from each thread into 32-bit words. The pack consumes
// an integer number of words, because the CTA size is a multiple of 32. The
// first NT * VT / 32 threads return the packed words.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
const int Items = MGPU_DIV_UP(32, VT);
int index = 32 * tid;
int first = index / VT;
int bit = 0;
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
__syncthreads();
return packed;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
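// Typical use (illustrative): an inclusive warp-wide prefix sum can be built
// by repeated shuffle-up-and-add with doubling offsets, e.g.
//   for(int offset = 1; offset < WARP_SIZE; offset *= 2)
//       x = shfl_add(x, offset);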
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync..b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
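// Worked example (illustrative): each selector nibble 0..3 picks a byte of a,
// 4..7 picks a byte of b, and the nibble's high bit sign-extends that byte.
//   prmt(0x03020100, 0x07060504, 0x3210) == 0x03020100
//   prmt(0x03020100, 0x07060504, 0x7654) == 0x07060504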
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
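// e.g. FindLog2(8) == 3; FindLog2(9) == 3; FindLog2(9, true) == 4.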
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
//
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
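// Worked example (illustrative): FastDivide(7) gives p = 34,
// coef = ceil(2^34 / 7) = 2454267027 and shift = 2, so
// Divide(100) == umulhi(100, coef) >> 2 == 14 and Modulus(100) == 2.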
#pragma GCC diagnostic pop
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = comp(aKey, bKey);
bool pB = comp(bKey, aKey);
// The outputs must come from A by definition of set intersection.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA == pB) commit |= 1<< i;
}
}
return commit;
}
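// Worked example (illustrative): for A = {1, 3, 5} and B = {3, 4, 5} stored
// contiguously in data, only the steps where aKey == bKey (keys 3 and 5) set
// commit bits, so those A-side copies are the emitted intersection results.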
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
// Both are in range.
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// Output A in case of a tie, so check if b < a.
results[i] = pB ? bKey : aKey;
indices[i] = pB ? bBegin : aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// The outputs must come from A by definition of set difference.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && (bBegin >= bEnd))
pA = true;
else if(RangeCheck && (aBegin >= aEnd))
pB = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
results[i] = pA ? aKey : bKey;
indices[i] = pA ? aBegin : bBegin;
if(!pA) ++bBegin;
if(!pB) ++aBegin;
if(pA != pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
int end = aBegin + bBegin + VT - star;
if(RangeCheck) end = min(end, aEnd + bEnd);
int commit;
switch(Op) {
case MgpuSetOpIntersection:
commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
aEnd, bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpUnion:
commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpDiff:
commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpSymDiff:
commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
}
__syncthreads();
return commit;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
// CUDA Compiler does not currently unroll these loops correctly. Write using
// template loop unrolling.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
#pragma unroll
for(int level = 0; level < VT; ++level) {
#pragma unroll
for(int i = 1 & level; i < VT - 1; i += 2) {
if(comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
}
}
}*/
template<int I, int VT>
struct OddEvenTransposeSortT {
// Sort segments marked by head flags. If the head flag between i and i + 1
// is set (so that (2<< i) & flags is true), the values belong to different
// segments and are not swapped.
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
#pragma unroll
for(int i = 1 & I; i < VT - 1; i += 2)
if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
}
};
template<int I> struct OddEvenTransposeSortT<I, I> {
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
int a, int b, Comp comp) {
if(b < Count) {
// Mask the bits between a and b. Any head flags in this interval
// means the keys are in different segments and must not be swapped.
const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
if(!(Mask & flags) && comp(keys[b], keys[a])) {
mgpu::swap(keys[b], keys[a]);
mgpu::swap(values[b], values[a]);
}
}
}
template<int R, int Low2, bool Recurse = 2 * R < Width>
struct OddEvenMerge {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
// Compare and swap
const int M = 2 * R;
OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
#pragma unroll
for(int i = Low2 + R; i + R < Low2 + Width; i += M)
CompareAndSwap(keys, values, flags, i, i + R, comp);
}
};
template<int R, int Low2>
struct OddEvenMerge<R, Low2, false> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
}
};
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) {
const int M = Width / 2;
OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
}
};
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh
// For 0 <= i < VT:
// index = NT * i + tid;
// reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = VT * tid + i.
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync = false);
// For 0 <= index < count:
// dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
int tid, T* dest, T init, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync = false);
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// gather = indices[index];
// reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// scatter = indices[index];
// data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync = true);
// For 0 <= i < VT:
// shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync = true);
// For 0 <= i < VT:
// threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync = true);
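// Typical CTA pattern (illustrative sketch; gid, shared, threadData and the
// *_global pointers are placeholder names supplied by the caller): load a
// strided tile into shared memory, re-read it in thread order, process it,
// then reverse the path on the way out.
//   DeviceGlobalToShared<NT, VT>(count, data_global + gid, tid, shared);
//   DeviceSharedToThread<VT>(shared, tid, threadData);
//   // ... operate on threadData[0..VT) ...
//   DeviceThreadToShared<VT>(threadData, tid, shared);
//   DeviceSharedToGlobal<NT, VT>(count, shared, tid, dest_global + gid);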
// For 0 <= index < aCount:
// shared[index] = a_global[index];
// For 0 <= index < bCount:
// shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
// If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper bound for loads:
// NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid;
// if(index < count)
// gather = indices_shared[index];
// dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global,
bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid
// if(index < count)
// gather = indices[index];
// if(gather < aCount) data = a_global[gather];
// else data = b_global[gather - aCount];
// dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
} // namespace mgpu
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-(x)))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X, optionally rounded up to the next integer
// when X is not a power of 2.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
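// Example values (illustrative): sIsPow2<64>::value == 1,
// sLogPow2<8>::value == 3, sDivUp<10, 4>::value == 3,
// sNumFactorsOf2<12>::value == 2.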
} // namespace mgpu
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward declaration of the stream handle to avoid pulling in CUDA/HIP headers.
// For the DCU (ROCm/HIP) build, CUstream aliases the HIP stream type.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] minibatch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] minibatch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
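/* Typical call sequence (illustrative sketch; probs, gradients, labels,
 * label_lengths, input_lengths, costs, alphabet_size and minibatch are
 * placeholders supplied by the caller):
 *
 *   ctcOptions options{};              // zero-initialize for forward compatibility
 *   options.loc = CTC_CPU;
 *   options.num_threads = 1;
 *   options.blank_label = 0;
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths, alphabet_size,
 *                      minibatch, options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);   // device allocation for CTC_GPU
 *   compute_ctc_loss(probs, gradients, labels, label_lengths, input_lengths,
 *                    alphabet_size, minibatch, costs, workspace, options);
 *   free(workspace);
 */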
#ifdef __cplusplus
}
#endif
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);