"tests/vscode:/vscode.git/clone" did not exist on "cd3fa030351d9d386991b7a943889f980db00513"
Commit 0bf5eb5f authored by lishen

warpctc for dcu

parent 949fcc19
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
return (const byte*)b - (const byte*)a;
}
// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
div_t d = div(numItems, numWorkers);
return make_int2(d.quot, d.rem);
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
int2 range;
range.x = task.x * block;
range.x += min(block, task.y);
range.y = range.x + task.x + (block < task.y);
return range;
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
int count) {
int2 range = ComputeTaskRange(block, task);
range.x *= blockSize;
range.y = min(count, range.y * blockSize);
return range;
}
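// Worked example (illustrative): partitioning 10 items over 4 workers with the
// two helpers above.
//   int2 task = DivideTaskRange(10, 4);    // task = (quot = 2, rem = 2)
//   ComputeTaskRange(0, task) == (0, 3)    // the first rem blocks get quot + 1 items
//   ComputeTaskRange(1, task) == (3, 6)
//   ComputeTaskRange(2, task) == (6, 8)    // remaining blocks get quot items
//   ComputeTaskRange(3, task) == (8, 10)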
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// ExtractThreadHeadFlags returns numBits flags starting at bit index.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
int numBits) {
int index2 = index>> 5;
int shift = 31 & index;
uint headFlags = flags[index2]>> shift;
int shifted = 32 - shift;
if(shifted < numBits)
// We also need to shift in the next set of bits.
headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
headFlags &= (1<< numBits) - 1;
return headFlags;
}
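// Worked example (illustrative): index = 30, numBits = 4 reads bits 30..31 of
// flags[0] and, because shifted = 2 < numBits, pulls bits 0..1 of flags[1] in
// above them before masking the result to the low 4 bits.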
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack the VT flag bits from each thread into 32-bit words. The pack consumes
// an integer number of words, because the CTA size is a multiple of 32. The
// first NT * VT / 32 threads return the packed words.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
const int Items = MGPU_DIV_UP(32, VT);
int index = 32 * tid;
int first = index / VT;
int bit = 0;
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
__syncthreads();
return packed;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
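// Typical use (illustrative): an inclusive warp-wide prefix sum can be built
// by repeated shuffle-up-and-add with doubling offsets, e.g.
//   for(int offset = 1; offset < WARP_SIZE; offset *= 2)
//       x = shfl_add(x, offset);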
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync..b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
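// Worked example (illustrative): each selector nibble 0..3 picks a byte of a,
// 4..7 picks a byte of b, and the nibble's high bit sign-extends that byte.
//   prmt(0x03020100, 0x07060504, 0x3210) == 0x03020100
//   prmt(0x03020100, 0x07060504, 0x7654) == 0x07060504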
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
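// e.g. FindLog2(8) == 3; FindLog2(9) == 3; FindLog2(9, true) == 4.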
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
//
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
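// Worked example (illustrative): FastDivide(7) gives p = 34,
// coef = ceil(2^34 / 7) = 2454267027 and shift = 2, so
// Divide(100) == umulhi(100, coef) >> 2 == 14 and Modulus(100) == 2.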
#pragma GCC diagnostic pop
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = comp(aKey, bKey);
bool pB = comp(bKey, aKey);
// The outputs must come from A by definition of set intersection.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA == pB) commit |= 1<< i;
}
}
return commit;
}
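// Worked example (illustrative): for A = {1, 3, 5} and B = {3, 4, 5} stored
// contiguously in data, only the steps where aKey == bKey (keys 3 and 5) set
// commit bits, so those A-side copies are the emitted intersection results.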
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
// Both are in range.
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// Output A in case of a tie, so check if b < a.
results[i] = pB ? bKey : aKey;
indices[i] = pB ? bBegin : aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// The outputs must come from A by definition of set difference.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && (bBegin >= bEnd))
pA = true;
else if(RangeCheck && (aBegin >= aEnd))
pB = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
results[i] = pA ? aKey : bKey;
indices[i] = pA ? aBegin : bBegin;
if(!pA) ++bBegin;
if(!pB) ++aBegin;
if(pA != pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
int end = aBegin + bBegin + VT - star;
if(RangeCheck) end = min(end, aEnd + bEnd);
int commit;
switch(Op) {
case MgpuSetOpIntersection:
commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
aEnd, bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpUnion:
commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpDiff:
commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpSymDiff:
commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
}
__syncthreads();
return commit;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
// CUDA Compiler does not currently unroll these loops correctly. Write using
// template loop unrolling.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
#pragma unroll
for(int level = 0; level < VT; ++level) {
#pragma unroll
for(int i = 1 & level; i < VT - 1; i += 2) {
if(comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
}
}
}*/
template<int I, int VT>
struct OddEvenTransposeSortT {
// Sort segments marked by head flags. If the head flag between i and i + 1
// is set (so that (2<< i) & flags is true), the values belong to different
// segments and are not swapped.
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
#pragma unroll
for(int i = 1 & I; i < VT - 1; i += 2)
if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
}
};
template<int I> struct OddEvenTransposeSortT<I, I> {
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
int a, int b, Comp comp) {
if(b < Count) {
// Mask the bits between a and b. Any head flags in this interval
// means the keys are in different segments and must not be swapped.
const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
if(!(Mask & flags) && comp(keys[b], keys[a])) {
mgpu::swap(keys[b], keys[a]);
mgpu::swap(values[b], values[a]);
}
}
}
template<int R, int Low2, bool Recurse = 2 * R < Width>
struct OddEvenMerge {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
// Compare and swap
const int M = 2 * R;
OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
#pragma unroll
for(int i = Low2 + R; i + R < Low2 + Width; i += M)
CompareAndSwap(keys, values, flags, i, i + R, comp);
}
};
template<int R, int Low2>
struct OddEvenMerge<R, Low2, false> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
}
};
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) {
const int M = Width / 2;
OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
}
};
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh
// For 0 <= i < VT:
// index = NT * i + tid;
// reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = VT * tid + i.
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync = false);
// For 0 <= index < count:
// dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
int tid, T* dest, T init, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync = false);
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// gather = indices[index];
// reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// scatter = indices[index];
// data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync = true);
// For 0 <= i < VT:
// shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync = true);
// For 0 <= i < VT:
// threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync = true);
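// Typical CTA pattern (illustrative sketch; gid, shared, threadData and the
// *_global pointers are placeholder names supplied by the caller): load a
// strided tile into shared memory, re-read it in thread order, process it,
// then reverse the path on the way out.
//   DeviceGlobalToShared<NT, VT>(count, data_global + gid, tid, shared);
//   DeviceSharedToThread<VT>(shared, tid, threadData);
//   // ... operate on threadData[0..VT) ...
//   DeviceThreadToShared<VT>(threadData, tid, shared);
//   DeviceSharedToGlobal<NT, VT>(count, shared, tid, dest_global + gid);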
// For 0 <= index < aCount:
// shared[index] = a_global[index];
// For 0 <= index < bCount:
// shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
// If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper bound for loads:
// NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid;
// if(index < count)
// gather = indices_shared[index];
// dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global,
bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid
// if(index < count)
// gather = indices[index];
// if(gather < aCount) data = a_global[gather];
// else data = b_global[gather - aCount];
// dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
} // namespace mgpu
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-(x)))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X, optionally rounded up to the next integer
// when X is not a power of 2.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
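// Example values (illustrative): sIsPow2<64>::value == 1,
// sLogPow2<8>::value == 3, sDivUp<10, 4>::value == 3,
// sNumFactorsOf2<12>::value == 2.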
} // namespace mgpu
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
#include <torch/extension.h>
extern "C" {
#endif
// Forward declaration of the stream handle to avoid pulling in CUDA/HIP headers.
// For the DCU (ROCm/HIP) build, CUstream aliases the HIP stream type.
//typedef struct CUstream_st* CUstream;
typedef struct ihipStream_t* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] minibatch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] minibatch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
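/* Typical call sequence (illustrative sketch; probs, gradients, labels,
 * label_lengths, input_lengths, costs, alphabet_size and minibatch are
 * placeholders supplied by the caller):
 *
 *   ctcOptions options{};              // zero-initialize for forward compatibility
 *   options.loc = CTC_CPU;
 *   options.num_threads = 1;
 *   options.blank_label = 0;
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths, alphabet_size,
 *                      minibatch, options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);   // device allocation for CTC_GPU
 *   compute_ctc_loss(probs, gradients, labels, label_lengths, input_lengths,
 *                    alphabet_size, minibatch, costs, workspace, options);
 *   free(workspace);
 */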
#ifdef __cplusplus
}
#endif
#pragma once
#ifdef __HIPCC__
#define HOSTDEVICE __device__ __host__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, CUstream stream);
#pragma once
/*
int cpu_ctc(THFloatTensor *probs,
THFloatTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label);