Commit a60ed4d7 authored by lishen

warpctc from github

parent 949fcc19
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include <climits>
#include <cfloat>
#include <cstddef>
#include <algorithm>
#include <functional>
#include <iterator>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
template<typename T>
struct less : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
template<typename T>
struct plus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
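// Illustrative sketch (added for this write-up; not part of the original
// moderngpu header): because these functors are MGPU_HOST_DEVICE, the same
// fold loop compiles for host and device callers. The helper name below is
// hypothetical.
template<typename T, typename Op>
MGPU_HOST_DEVICE T ExampleFold(const T* data, int n, Op op, T init) {
    T acc = init;
    for(int i = 0; i < n; ++i)
        acc = op(acc, data[i]);
    return acc;
}
// e.g. ExampleFold(data, n, plus<int>(), 0) sums the array, and
// ExampleFold(data, n, maximum<int>(), INT_MIN) takes its maximum.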
////////////////////////////////////////////////////////////////////////////////
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
T c = a;
a = b;
b = c;
}
template<typename T>
struct DevicePair {
T x, y;
};
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
DevicePair<T> p = { x, y };
return p;
}
template<typename T> struct numeric_limits;
template<> struct numeric_limits<int> {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
a = a + b;
return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
a = a * b;
return a;
}
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::max(a, b);
#else
return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::min(a, b);
#else
return (b < a) ? b : a;
#endif
}
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
return _value;
}
MGPU_HOST_DEVICE T operator*() const {
return _value;
}
MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
return *this;
}
MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
return *this;
}
private:
T _value;
};
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _value + i;
}
MGPU_HOST_DEVICE T operator*() {
return _value;
}
MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
return counting_iterator(_value + diff);
}
MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
return counting_iterator(_value - diff);
}
MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
_value += diff;
return *this;
}
MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
_value -= diff;
return *this;
}
private:
T _value;
};
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + (-diff);
}
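// Illustrative check (added for this write-up; not part of the original
// header): step_iterator yields base + k * step for logical index k, so it can
// stand in for a strided pointer. The hypothetical host helper below verifies
// the indexing rule.
inline bool ExampleStepIteratorCheck() {
    mgpu::step_iterator<int> it(10, 3);          // base = 10, step = 3
    return (*it == 10) && (it[2] == 16) && (*(it + 4) == 22);
}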
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
return (const byte*)b - (const byte*)a;
}
// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
div_t d = div(numItems, numWorkers);
return make_int2(d.quot, d.rem);
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
int2 range;
range.x = task.x * block;
range.x += min(block, task.y);
range.y = range.x + task.x + (block < task.y);
return range;
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
int count) {
int2 range = ComputeTaskRange(block, task);
range.x *= blockSize;
range.y = min(count, range.y * blockSize);
return range;
}
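// Worked example (added for this write-up; not part of the original source):
// dividing 100 items over 8 workers gives task = make_int2(12, 4), so blocks
// 0..3 own 13 items each and blocks 4..7 own 12. The hypothetical host check
// below verifies the first and last ranges.
MGPU_HOST bool ExampleTaskRangeCheck() {
    int2 task = DivideTaskRange(100, 8);   // task.x = 12 (quot), task.y = 4 (rem)
    int2 r0 = ComputeTaskRange(0, task);   // [0, 13)
    int2 r7 = ComputeTaskRange(7, task);   // [88, 100)
    return r0.x == 0 && r0.y == 13 && r7.x == 88 && r7.y == 100;
}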
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// DeviceExtractHeadFlags returns numBits flags starting at bit index.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
int numBits) {
int index2 = index>> 5;
int shift = 31 & index;
uint headFlags = flags[index2]>> shift;
int shifted = 32 - shift;
if(shifted < numBits)
// We also need to shift in the next set of bits.
headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
headFlags &= (1<< numBits) - 1;
return headFlags;
}
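// Illustrative check (added for this write-up; not part of the original
// source): a request that straddles a word boundary pulls the low bits from
// flags[index / 32] and the remaining bits from the next word. Hypothetical
// host-side example: extracting 4 flags at bit 30 from the words below yields
// 0xf.
inline bool ExampleHeadFlagsCheck() {
    uint flags[2] = { 0xc0000000u, 0x00000003u };   // bits 30, 31 and 32, 33 set
    return 0xfu == DeviceExtractHeadFlags(flags, 30, 4);
}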
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack the VT flag bits held by each thread into 32-bit words. An integer
// number of words is consumed, because the CTA size is a multiple of 32. The
// first NT * VT / 32 threads return the packed words.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
const int Items = MGPU_DIV_UP(32, VT);
int index = 32 * tid;
int first = index / VT;
int bit = 0;
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
__syncthreads();
return packed;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
    int result = 0;
#if __CUDA_ARCH__ >= 300
    int mask = (WARP_SIZE - width)<< 8;
    asm(
        "{.reg .s32 r0;"
        ".reg .pred p;"
        "shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
        "@p add.s32 r0, r0, %5;"
        "mov.s32 %0, r0; }"
        : "=r"(result)
        : "r"(x), "r"(offset), "r"(mask), "r"(0xffffffffu), "r"(x));
#endif
    return result;
}
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
    int result = 0;
#if __CUDA_ARCH__ >= 300
    int mask = (WARP_SIZE - width)<< 8;
    asm(
        "{.reg .s32 r0;"
        ".reg .pred p;"
        "shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
        "@p max.s32 r0, r0, %5;"
        "mov.s32 %0, r0; }"
        : "=r"(result)
        : "r"(x), "r"(offset), "r"(mask), "r"(0xffffffffu), "r"(x));
#endif
    return result;
}
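// Usage sketch (added for this write-up; not part of the original source):
// shfl_add composes into a Kogge-Stone inclusive scan across a warp. Each step
// adds the value held by the lane `offset` positions below when that lane
// exists. The device helper below is hypothetical and only shows the intended
// call pattern.
MGPU_DEVICE int ExampleWarpInclusiveScan(int x) {
    #pragma unroll
    for(int offset = 1; offset < WARP_SIZE; offset *= 2)
        x = shfl_add(x, offset, WARP_SIZE);
    return x;
}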
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count the number of set bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
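// Illustrative check (added for this write-up; not part of the original
// source): each nibble of the prmt selector picks one byte from the
// concatenation {a, b}, so selector 0x0123 reverses the bytes of a. The
// hypothetical host check below exercises the emulation path.
inline bool ExamplePrmtCheck() {
    return 0x00112233u == prmt(0x33221100u, 0u, 0x0123u);
}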
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// umulhi
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
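// Illustrative check (added for this write-up; not part of the original
// source): for denom = 10 the constructor computes p = 35, coef = 0xcccccccd,
// shift = 3, so Divide(100) returns 10 and Modulus(100) returns 0. The helper
// below is a hypothetical host-side sanity check.
inline bool ExampleFastDivideCheck() {
    FastDivide fd(10);
    return fd.Divide(100) == 10 && fd.Modulus(100) == 0;
}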
#pragma GCC diagnostic pop
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Cooperative load functions.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
    bool sync = true) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegPred(int count, InputIt data, int tid,
T* reg, bool sync) {
// TODO: Attempt to issue 4 loads at a time.
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else
DeviceGlobalToRegPred<NT, VT>(count, data, tid, reg, false);
if(sync) __syncthreads();
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg2(int count, InputIt data, int tid,
T* reg, bool sync) {
DeviceGlobalToReg<NT, VT0>(count, data, tid, reg, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
reg[i] = init;
if(index < count) reg[i] = data[index];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
DeviceGlobalToRegDefault<NT, VT0>(count, data, tid, reg, init, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
reg[i] = init;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
if(i < count) reg[i] = ldg(data + i);
}
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = (i < count) ? ldg(data + i) : init;
}
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative store functions.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i)
dest[NT * i + tid] = (T2)reg[i];
if(sync) __syncthreads();
}
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = reg[i];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMemToMemLoop
// Transfer from shared memory to global, or global to shared, for transfers
// that are smaller than NT * VT in the average case. The goal is to reduce
// unnecessary comparison logic.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMem4(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<InputIt>::value_type T;
T x[VT];
const int Count = (VT < 4) ? VT : 4;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < Count; ++i)
x[i] = source[NT * i + tid];
#pragma unroll
for(int i = 0; i < Count; ++i)
dest[NT * i + tid] = x[i];
} else {
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
x[i] = source[NT * i + tid];
}
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = x[i];
}
}
if(sync) __syncthreads();
}
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
for(int i = 0; i < count; i += 4 * NT)
DeviceMemToMem4<NT, 4>(count - i, source + i, tid, dest + i,
false);
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Functions to copy between shared and global memory where the average case is
// to transfer NT * VT elements.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) dest[index] = (T2)source[index];
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT];
DeviceGlobalToReg<NT, VT>(count, source, tid, reg, false);
DeviceRegToShared<NT, VT>(reg, tid, dest, sync);
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT1];
DeviceGlobalToReg2<NT, VT0, VT1>(count, source, tid, reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, dest, sync);
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync) {
T reg[VT];
DeviceGlobalToRegDefault<NT, VT>(count, source, tid, reg, init, false);
DeviceRegToShared<NT, VT>(reg, tid, dest, sync);
}
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt data, int tid,
T* dest, T init, bool sync) {
T reg[VT1];
DeviceGlobalToRegDefault2<NT, VT0, VT1>(count, data, tid, reg, init, false);
DeviceRegToShared<NT, VT1>(reg, tid, dest, sync);
}
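// Usage sketch (added for this write-up; not part of the original moderngpu
// source): a CTA of NT threads typically stages a tile of up to NT * VT
// elements into shared memory with the helpers above, synchronizes, and then
// consumes the tile. The kernel below is hypothetical and assumes the grid has
// MGPU_DIV_UP(count, NT * VT) blocks; it just copies out the first element of
// each tile.
template<int NT, int VT, typename T>
__global__ void ExampleStageTileKernel(const T* data_global, int count,
    T* tileFirst_global) {
    __shared__ T shared[NT * VT];
    int tid = threadIdx.x;
    int gid = NT * VT * blockIdx.x;
    int count2 = min(NT * VT, count - gid);
    // Cooperative load: all NT threads participate; sync = true makes the tile
    // visible to every thread before it is read.
    DeviceGlobalToShared<NT, VT>(count2, data_global + gid, tid, shared, true);
    if(!tid) tileFirst_global[blockIdx.x] = shared[0];
}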
////////////////////////////////////////////////////////////////////////////////
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync) {
const int Granularity = MGPU_MIN(VT, 3);
DeviceGlobalToShared<NT, Granularity>(count, source, tid, dest, false);
int offset = Granularity * NT;
if(count > offset)
DeviceGlobalToShared<NT, VT - Granularity>(count - offset,
source + offset, tid, dest + offset, false);
if(sync) __syncthreads();
/*
source += tid;
while(count > 0) {
T reg[Granularity];
#pragma unroll
for(int i = 0; i < Granularity; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = source[NT * i];
}
DeviceRegToShared<NT, Granularity>(reg, tid, dest, false);
source += Granularity * NT;
dest += Granularity * NT;
count -= Granularity * NT;
}
if(sync) __syncthreads();*/
}
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits<OutputIt>::value_type T;
T values[VT];
DeviceGlobalToReg<NT, VT>(count, source, tid, values, false);
DeviceRegToGlobal<NT, VT>(count, values, tid, dest, sync);
}
////////////////////////////////////////////////////////////////////////////////
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
// This function has a known bug; do not use it. Use the SmemTranspose
// workaround (WAR) or fix the bug first. TODO(erich)
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y) {
    printf("HalfSmemTranspose has a known bug; use the SmemTranspose workaround "
        "or fix the bug before using this in production.\n");
// Transpose the first half values (tid < NT / 2)
#pragma unroll
for(int i = 0; i <= VT / 2; ++i)
if(i < VT / 2 || tid < NT / 2)
shared[NT * i + tid] = x[i];
__syncthreads();
if(tid < NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i];
}
__syncthreads();
// Transpose the second half values (tid >= NT / 2)
#pragma unroll
for(int i = VT / 2; i < VT; ++i)
if(i > VT / 2 || tid >= NT / 2)
shared[NT * i - NT * VT / 2 + tid] = x[i];
__syncthreads();
if(tid >= NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i - NT * VT / 2];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Gather/scatter functions
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = data[indices[i]];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
reg[i] = (index < count) ? data[indices[i]] : identity;
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
data[indices[i]] = reg[i];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
data[indices[i]] = reg[i];
}
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative transpose functions (strided to thread order)
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync) {
if(1 & VT) {
// Odd grain size. Store as type T.
#pragma unroll
for(int i = 0; i < VT; ++i)
shared[VT * tid + i] = threadReg[i];
} else {
// Even grain size. Store as DevicePair<T>. This lets us exploit the
// 8-byte shared memory mode on Kepler.
DevicePair<T>* dest = (DevicePair<T>*)(shared + VT * tid);
#pragma unroll
for(int i = 0; i < VT / 2; ++i)
dest[i] = MakeDevicePair(threadReg[2 * i], threadReg[2 * i + 1]);
}
if(sync) __syncthreads();
}
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync) {
if(1 & VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
threadReg[i] = shared[VT * tid + i];
} else {
const DevicePair<T>* source = (const DevicePair<T>*)(shared + VT * tid);
#pragma unroll
for(int i = 0; i < VT / 2; ++i) {
DevicePair<T> p = source[i];
threadReg[2 * i] = p.x;
threadReg[2 * i + 1] = p.y;
}
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of the same type. Optimize for a single LD
// statement.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync) {
int b0 = b_global - a_global - aCount;
int total = aCount + bCount;
if(total >= NT * VT0) {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
} else {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < total)
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
}
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < total)
reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
}
}
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync) {
T reg[VT1];
DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, shared, sync);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of different types. Uses two LD statements.
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync) {
b_global -= aCount;
int total = aCount + bCount;
if(total >= NT * VT0) {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else reg[i] = b_global[index];
}
} else {
#pragma unroll
for(int i = 0; i < VT0; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else if(index < total) reg[i] = b_global[index];
}
}
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < aCount) reg[i] = a_global[index];
else if(index < total) reg[i] = b_global[index];
}
}
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync) {
T reg[VT1];
DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
reg, false);
DeviceRegToShared<NT, VT1>(reg, tid, shared, sync);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceGatherGlobalToGlobal
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global, bool sync) {
typedef typename std::iterator_traits<InputIt>::value_type ValType;
ValType values[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) {
int gather = indices_shared[index];
values[i] = data_global[gather];
}
}
if(sync) __syncthreads();
DeviceRegToGlobal<NT, VT>(count, values, tid, dest_global, false);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceTransferMergeValues
// Gather values in a merge-like pattern from two input arrays and store them
// to a single output. Like DeviceGatherGlobalToGlobal, but for two arrays at
// once.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync) {
b_global -= bStart;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
reg[i] = (indices[i] < bStart) ? a_global[indices[i]] :
b_global[indices[i]];
}
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = (indices[i] < bStart) ? a_global[indices[i]] :
b_global[indices[i]];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync) {
int indices[VT];
DeviceSharedToReg<NT, VT>(indices_shared, tid, indices);
typedef typename std::iterator_traits<InputIt1>::value_type ValType;
ValType reg[VT];
DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
indices, tid, reg, sync);
DeviceRegToGlobal<NT, VT>(count, reg, tid, dest_global, sync);
}
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid, T* reg,
bool sync) {
int bOffset = (int)(b_global - a_global - bStart);
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int gather = indices[i];
if(gather >= bStart) gather += bOffset;
reg[i] = a_global[gather];
}
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
int gather = indices[i];
if(gather >= bStart) gather += bOffset;
if(index < count)
reg[i] = a_global[gather];
}
}
if(sync) __syncthreads();
}
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync) {
int indices[VT];
DeviceSharedToReg<NT, VT>(indices_shared, tid, indices);
T reg[VT];
DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
indices, tid, reg, sync);
DeviceRegToGlobal<NT, VT>(count, reg, tid, dest_global, sync);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = comp(aKey, bKey);
bool pB = comp(bKey, aKey);
// The outputs must come from A by definition of set intersection.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA == pB) commit |= 1<< i;
}
}
return commit;
}
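// Worked example (added for this write-up; not part of the original source):
// for data = { 1, 3, 5, 3, 5, 7 } with aBegin = 0, aEnd = 3, bBegin = 3,
// bEnd = 6, end = 7, VT = 4, RangeCheck = true and comp = less<int>, the loop
// visits the pairs (1,3), (3,3), (5,5) and then fails the range test. It
// writes results = { 1, 3, 5, - } but sets commit = 0b0110, so only the equal
// keys 3 and 5 (iterations 1 and 2) are kept by the caller.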
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
// Both are in range.
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// Output A in case of a tie, so check if b < a.
results[i] = pB ? bKey : aKey;
indices[i] = pB ? bBegin : aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// The outputs must come from A by definition of set difference.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && (bBegin >= bEnd))
pA = true;
else if(RangeCheck && (aBegin >= aEnd))
pB = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
results[i] = pA ? aKey : bKey;
indices[i] = pA ? aBegin : bBegin;
if(!pA) ++bBegin;
if(!pB) ++aBegin;
if(pA != pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
int end = aBegin + bBegin + VT - star;
if(RangeCheck) end = min(end, aEnd + bEnd);
int commit;
switch(Op) {
case MgpuSetOpIntersection:
commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
aEnd, bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpUnion:
commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpDiff:
commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpSymDiff:
commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
}
__syncthreads();
return commit;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
// The CUDA compiler does not currently unroll these loops correctly, so the
// sort is written with template loop unrolling instead.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
#pragma unroll
for(int level = 0; level < VT; ++level) {
#pragma unroll
for(int i = 1 & level; i < VT - 1; i += 2) {
if(comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
}
}
}*/
template<int I, int VT>
struct OddEvenTransposeSortT {
// Sort segments marked by head flags. If the head flag between i and i + 1
// is set (so that (2<< i) & flags is true), the values belong to different
// segments and are not swapped.
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
#pragma unroll
for(int i = 1 & I; i < VT - 1; i += 2)
if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
}
};
template<int I> struct OddEvenTransposeSortT<I, I> {
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
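// Illustrative sketch (not part of the original library): sorting VT = 4
// register-resident key/value pairs in a single thread, assuming a less-than
// comparator such as mgpu::less<int>:
//   int keys[4] = { 7, 3, 9, 1 };
//   int vals[4] = { 0, 1, 2, 3 };
//   OddEvenTransposeSort<4>(keys, vals, mgpu::less<int>());
//   // keys become { 1, 3, 7, 9 } and the values follow their keys: { 3, 1, 0, 2 }.
// Passing head flags to OddEvenTransposeSortFlags instead confines swaps to within
// each flagged segment.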
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
int a, int b, Comp comp) {
if(b < Count) {
			// Mask the bits between a and b. Any head flag in this interval means the
			// keys are in different segments and must not be swapped.
const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
if(!(Mask & flags) && comp(keys[b], keys[a])) {
mgpu::swap(keys[b], keys[a]);
mgpu::swap(values[b], values[a]);
}
}
}
template<int R, int Low2, bool Recurse = 2 * R < Width>
struct OddEvenMerge {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
// Compare and swap
const int M = 2 * R;
OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
#pragma unroll
for(int i = Low2 + R; i + R < Low2 + Width; i += M)
CompareAndSwap(keys, values, flags, i, i + R, comp);
}
};
template<int R, int Low2>
struct OddEvenMerge<R, Low2, false> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
}
};
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) {
const int M = Width / 2;
OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
}
};
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}
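// Worked note on the Width computation above (illustrative): Width is VT rounded
// up to the next power of 2, e.g. VT = 7 and VT = 8 both give Width = 8. Positions
// at or beyond Count (= VT) are never exchanged because CompareAndSwap guards every
// swap with its (b < Count) test.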
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh
// For 0 <= i < VT:
// index = NT * i + tid;
// reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = VT * tid + i.
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init);
// For 0 <= i < VT:
// index = NT * i + tid;
//		data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync = false);
// For 0 <= index < count:
// dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
int tid, T* dest, T init, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync = false);
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// gather = indices[index];
// reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// scatter = indices[index];
// data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync = true);
// For 0 <= i < VT:
// shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync = true);
// For 0 <= i < VT:
// threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync = true);
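// Illustrative contrast of the two orderings (not part of the original interface):
// with NT = 4 threads and VT = 2 values per thread, thread tid = 1 touches
//   strided order (DeviceRegToShared / DeviceSharedToReg): NT * i + tid -> { 1, 5 }
//   thread order (DeviceThreadToShared / DeviceSharedToThread): VT * tid + i -> { 2, 3 }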
// For 0 <= index < aCount:
// shared[index] = a_global[index];
// For 0 <= index < bCount:
// shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
// If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
//		NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid;
// if(index < count)
// gather = indices_shared[index];
// dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global,
bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid
// if(index < count)
// gather = indices[index];
// if(gather < aCount) data = a_global[gather];
// else data = b_global[gather - aCount];
// dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
} // namespace mgpu
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
namespace mgpu {
enum MgpuBounds {
MgpuBoundsLower,
MgpuBoundsUpper
};
enum MgpuScanType {
MgpuScanTypeExc,
MgpuScanTypeInc
};
enum MgpuSearchType {
MgpuSearchTypeNone,
MgpuSearchTypeIndex,
MgpuSearchTypeMatch,
MgpuSearchTypeIndexMatch
};
enum MgpuJoinKind {
MgpuJoinKindInner,
MgpuJoinKindLeft,
MgpuJoinKindRight,
MgpuJoinKindOuter
};
enum MgpuSetOp {
MgpuSetOpIntersection,
MgpuSetOpUnion,
MgpuSetOpDiff,
MgpuSetOpSymDiff
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef MGPU_MIN
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
#endif // MGPU_MIN
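// Worked examples of the macros above (illustrative only):
//   MGPU_DIV_UP(10, 4)        == 3    // ceil(10 / 4)
//   MGPU_ROUND_UP(10, 4)      == 12   // next multiple of 4
//   MGPU_ROUND_UP_POW2(10, 8) == 16   // y must be a power of 2
//   MGPU_IS_POW_2(64)         // true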
namespace mgpu {
typedef unsigned char byte;
typedef unsigned int uint;
typedef signed short int16;
typedef unsigned short ushort;
typedef unsigned short uint16;
typedef long long int64;
typedef unsigned long long uint64;
// sIsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
enum { value = 0 == (X & (X - 1)) };
};
// Finds the base-2 logarithm of X. If X is not a power of 2, the result is rounded
// up or down according to roundUp.
template<int X, bool roundUp = true> struct sLogPow2 {
enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
enum { inner = sLogPow2<X / 2>::inner + 1 };
enum { value = inner + extra };
};
template<bool roundUp> struct sLogPow2<0, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
template<bool roundUp> struct sLogPow2<1, roundUp> {
enum { inner = 0 };
enum { value = 0 };
};
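// Illustrative values (not from the original header): sLogPow2<8>::value == 3;
// for a non-power-of-2 argument the result is rounded per roundUp, e.g.
// sLogPow2<7, true>::value == 3 while sLogPow2<7, false>::value == 2.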
template<int X, int Y>
struct sDivUp {
enum { value = (X + Y - 1) / Y };
};
template<int count, int levels> struct sDiv2RoundUp {
enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
};
template<int count> struct sDiv2RoundUp<count, 0> {
enum { value = count };
};
template<int X, int Y>
struct sDivSafe {
enum { value = X / Y };
};
template<int X>
struct sDivSafe<X, 0> {
enum { value = 0 };
};
template<int X, int Y>
struct sRoundUp {
enum { rem = X % Y };
enum { value = X + (rem ? (Y - rem) : 0) };
};
template<int X, int Y>
struct sRoundDown {
enum { rem = X % Y };
enum { value = X - rem };
};
// sIntegerDiv is a template for avoiding divisions by zero in template
// evaluation. Templates always evaluate both b and c in an expression like
// a ? b : c, and will error if either side contains an illegal expression,
// even if the ternary is explicitly designed to guard against that.
template<int X, int Y>
struct sIntegerDiv {
enum { value = X / (Y ? Y : (X + 1)) };
};
template<int X, int Y>
struct sMax {
enum { value = (X >= Y) ? X : Y };
};
template<int X, int Y>
struct sMin {
enum { value = (X <= Y) ? X : Y };
};
template<int X>
struct sAbs {
enum { value = (X >= 0) ? X : -X };
};
// Finds the number of powers of 2 in the prime factorization of X.
template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
enum { shifted = X >> 1 };
enum { value = 1 + sNumFactorsOf2<shifted>::value };
};
template<int X> struct sNumFactorsOf2<X, 1> {
enum { value = 0 };
};
// Returns the divisor for a conflict-free transpose.
template<int X, int NumBanks = 32> struct sBankConflictDivisor {
enum { value =
(1 & X) ? 0 :
(sIsPow2<X>::value ? NumBanks :
(1<< sNumFactorsOf2<X>::value)) };
enum { log_value = sLogPow2<value>::value };
};
template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
enum { count = NT * X };
enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
enum { padding = sDivSafe<count, divisor>::value };
enum { value = count + padding };
};
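// Illustrative evaluation of the templates above (values derived here, not from the
// original header): for NT = 128 threads and X = 12 values per thread,
// sNumFactorsOf2<12>::value == 2, so sBankConflictDivisor<12>::value == 4, giving
// sConflictFreeStorage<128, 12>::value == 128 * 12 + (128 * 12) / 4 == 1920 elements.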
} // namespace mgpu
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// Forward declaration of the CUDA stream type to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
const char* ctcGetStatusString(ctcStatus_t status);
typedef enum {
CTC_CPU = 0,
CTC_GPU = 1
} ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank label
int blank_label;
};
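/* Illustrative sketch of the recommended initialization (not part of the API
 * itself): zero the structure, then select the CPU path with 4 threads and
 * label index 0 as the blank.
 *
 *   ctcOptions options;
 *   memset(&options, 0, sizeof(ctcOptions));
 *   options.loc = CTC_CPU;
 *   options.num_threads = 4;
 *   options.blank_label = 0;
 */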
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on info. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions.
* More precisely, element (t, n, p), for a problem with mini_batch examples
* in the mini batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
 * \param [in] minibatch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float *costs,
void *workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
 * \param [in] minibatch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size, int minibatch,
ctcOptions info,
size_t* size_bytes);
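/* Illustrative call sequence (a sketch, not part of this header): query the
 * workspace size, allocate it in the same memory space as the activations, then
 * compute the loss and gradients. Error checking is omitted for brevity.
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths, alphabet_size, minibatch,
 *                      options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);   // cudaMalloc when loc == CTC_GPU
 *   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
 *                    input_lengths, alphabet_size, minibatch, costs,
 *                    workspace, options);
 *   free(workspace);
 */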
#ifdef __cplusplus
}
#endif
#pragma once
#include <tuple>
#include <cmath>
#include <limits>
#include <algorithm>
#include <numeric>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template<typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size, int minibatch, void* workspace, int num_threads,
int blank_label) :
alphabet_size_(alphabet_size), minibatch_(minibatch),
num_threads_(num_threads), workspace_(workspace),
blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT *grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
void* workspace, size_t bytes_used, int blank_label,
const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations, ProbT* probs,
const int* const input_lengths);
std::tuple<ProbT, bool>
cost_and_grad_kernel(ProbT *grad, const ProbT* const probs,
const int* const labels, int T, int L,
int mb, size_t bytes_used);
ProbT compute_alphas(const ProbT* probs, int repeats, int S, int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas);
ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
ProbT log_partition, int repeats,
int S, int T, const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template<typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
int alphabet_size,
void* workspace, size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template<typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
int blank_label, int L, int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1;
int repeats = 0;
for (int i = 1; i < L; ++i) {
if (labels[i-1] == labels[i]) {
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1;
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
}
else {
s_inc[s_counter++] = 2;
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1;
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label;
return repeats;
}
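// Worked example of setup_labels (illustrative, not from the original source): for
// labels {1, 1, 2} with blank_label = 0, so L = 3 and S = 7:
//   labels_w_blanks = {0, 1, 0, 1, 0, 2, 0}
//   s_inc = {1, 1, 1, 2}, e_inc = {1, 1, 2, 1}, repeats = 1 (the repeated 1).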
template<typename ProbT>
void
CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
const int* const input_lengths) {
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
for(int c = 0; c < input_lengths[mb]; ++c) {
int col_offset = (mb + minibatch_ * c) * alphabet_size_;
ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
for(int r = 0; r < alphabet_size_; ++r)
max_activation = std::max(max_activation, activations[r + col_offset]);
ProbT denom = ProbT(0.);
for(int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] = std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
for(int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
}
}
}
}
template<typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT *grad, const ProbT* const probs,
const int* const labels,
int T, int L, int mb, size_t bytes_used) {
const int S = 2*L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);
bool over_threshold = false;
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs, ctcm.repeats, S, T, ctcm.e_inc,
ctcm.s_inc, ctcm.labels_w_blanks,
ctcm.alphas);
ProbT llBackward = compute_betas_and_grad(grad, probs, llForward, ctcm.repeats,
S, T, ctcm.e_inc, ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas,
ctcm.betas,
ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
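// In log space the recursion below is the standard CTC forward pass (summary added
// for clarity, not verbatim from the original comments): with the blank-augmented
// label sequence l' of length S and per-step log-probabilities log p_t(k),
//   alpha_t(s) = log p_t(l'_s) + logsumexp( alpha_{t-1}(s), alpha_{t-1}(s-1),
//                 [ alpha_{t-1}(s-2) only if l'_s is not blank and l'_s != l'_{s-2} ] )
// and the returned log-likelihood is logsumexp over the last one or two entries of
// the final column.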
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S /2) + repeats - T) < 0) ? 0 : 1,
end = S > 1 ? 2 : 1;
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
for(int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
if(remain >= 0)
start += s_inc[remain];
if(t <= (S / 2) + repeats)
end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
startloop += 1;
}
for(int i = startloop; i < end; ++i) {
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i-1) + idx2]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i-2])
prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i-2) + idx2]);
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for(int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
return loglike;
}
// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we update the product alpha * beta and then
// sum it into the gradient associated with each label.
// NOTE: this computes the gradient w.r.t. the UNNORMALIZED final-layer activations.
// The grads passed in are assumed to already be zeroed!
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const probs,
ProbT log_partition, int repeats,
int S, int T, const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0,
end = (T > (S / 2) + repeats) ? S : S-1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
//set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] = std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
//compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
//update the gradient associated with this label
//essentially performing a reduce-by-key in a sequential manner
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
}
//update the gradient w.r.t. each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] -
std::log(probs[idx3]) - log_partition);
}
}
//loop from the second to last column all the way to the left
for(int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if(remain >= -1)
start -= s_inc[remain + 1];
if(t < (S / 2) + repeats)
end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for(int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i+1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S-2) && labels[i] != labels[i+2]){
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i+2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
//compute alpha * beta in log space
alphas[i + idx1] += betas[i];
//update the gradient associated with this label
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S-1)] = betas[(S-1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S-1) + idx1] += betas[(S-1)];
output[labels[S-1]] =
ctc_helper::log_plus<ProbT>()(alphas[S-1 + idx1], output[labels[S-1]]);
}
//go over the unique labels and compute the final grad
// w.r.t. each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] -
std::log(probs[idx3]) - log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for(int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
return loglike;
}
template<typename ProbT>
ctcStatus_t
CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT *grads,
ProbT *costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT *>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
//per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
//alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
//betas
per_minibatch_bytes += sizeof(float) * maxS;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) =
cost_and_grad_kernel(grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T, L, mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT *>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
//per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
//output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
//alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
//betas
per_minibatch_bytes += sizeof(float) * maxS;
//labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2*L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
bytes_used + mb * per_minibatch_bytes, blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_, ctcm.repeats, S, T,
ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks,
ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
#pragma once
#include <limits>
#include <algorithm>
#include <cmath>
#include "hostdevice.h"
namespace ctc_helper {
static const float threshold = 1e-1;
template<typename T>
HOSTDEVICE
T neg_inf() { return -T(INFINITY); }
inline int div_up(int x, int y) {
return (x + y - 1) / y;
}
template <typename Arg, typename Res = Arg> struct maximum {
HOSTDEVICE
Res operator()(const Arg& x, const Arg& y) const {
return x < y ? y : x;
}
};
template <typename Arg, typename Res = Arg> struct add {
HOSTDEVICE
Res operator()(const Arg& x, const Arg& y) const {
return x + y;
}
};
template <typename Arg, typename Res = Arg> struct identity {
HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);}
};
template <typename Arg, typename Res = Arg> struct negate {
HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);}
};
template <typename Arg, typename Res = Arg> struct exponential {
HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);}
};
template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
struct log_plus {
typedef Res result_type;
HOSTDEVICE
Res operator()(const Arg1& p1, const Arg2& p2) {
if (p1 == neg_inf<Arg1>())
return p2;
if (p2 == neg_inf<Arg2>())
return p1;
Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
return result;
}
};
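// Illustrative identity (not part of the original header): log_plus computes
// log(exp(p1) + exp(p2)) stably as max(p1, p2) + log1p(exp(-|p1 - p2|)); for
// example, log_plus<float>()(std::log(0.2f), std::log(0.3f)) is approximately
// std::log(0.5f).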
}
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
template <typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void *workspace,
CUstream stream,
int blank_label) :
out_dim_(alphabet_size), minibatch_(minibatch),
gpu_workspace_(workspace), stream_(stream),
blank_label_(blank_label) {};
// Noncopyable
GpuCTC(const GpuCTC&) = delete;
GpuCTC& operator=(const GpuCTC&) = delete;
ctcStatus_t
cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t
score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
template<int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
ProbT *grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t
launch_gpu_kernels(const ProbT* const probs,
ProbT *grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t
setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t
create_metadata_and_choose_config(const int* const flat_labels,
                                  const int* const label_lengths,
const int* const input_lengths,
size_t& best_config);
ctcStatus_t
compute_probs(const ProbT* const activations);
ctcStatus_t
compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void *gpu_workspace_; // Buffer for all temporary GPU memory
int *utt_length_; // T
int *label_sizes_; // L
int *repeats_; // repeats_
int *label_offsets_;
int *labels_without_blanks_;
int *labels_with_blanks_;
ProbT *alphas_;
ProbT *nll_forward_;
ProbT *nll_backward_;
ProbT *denoms_; // Temporary storage for denoms for softmax
ProbT *probs_; // Temporary storage for probabilities (softmax output)
};
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths)
{
size_t gpu_bytes_used = 0;
nll_forward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass+1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i-1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta ) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries
// away
const int stride = minibatch_;
if (compute_alpha)
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_,
repeats_, labels_without_blanks_, label_offsets_,
labels_with_blanks_, alphas_, nll_forward_,
stride, out_dim_, S_, T_, blank_label_);
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_, repeats_,
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
grads, stride, out_dim_, S_, T_, blank_label_);
cudaStreamSynchronize(stream_);
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
size_t& best_config) {
// Setup the metadata for GPU
ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS)
return status;
constexpr int num_configs = 12;
int config_NT[num_configs] =
{32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] =
{ 1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i]* config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs)
return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
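// Illustrative example of the choice above (not from the original source): the
// candidate tile sizes NT * VT are {32, 64, 128, 192, 256, 288, 384, 512, 576, 768,
// 1152, 1280}; the first one that covers S_ is picked. For S_ = 201 (a longest valid
// label length of 100) that is config 4, i.e. NT = 128, VT = 2.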
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const probs,
ProbT* grads,
size_t config,
bool l_a,
bool l_b) {
switch(config) {
case 0: {return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);}
case 1: {return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);}
case 2: {return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);}
case 3: {return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);}
case 4: {return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);}
case 5: {return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);}
case 6: {return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);}
case 7: {return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);}
case 8: {return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);}
case 9: {return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);}
case 10: {return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);}
case 11: {return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
cudaError_t cuda_status;
cuda_status =
cudaMemcpyAsync(probs_, activations,
activation_cols_ * out_dim_ *sizeof(ProbT),
cudaMemcpyDeviceToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable softmax
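// The four launches below implement, per column j of probs_ (explanatory summary,
// not original text):
//   m_j = max_i probs_[i, j]                   (reduce_max into denoms_)
//   probs_[i, j] -= m_j                        (prepare_stable_SM_kernel)
//   d_j = sum_i exp(probs_[i, j])              (reduce_exp into denoms_)
//   probs_[i, j] = exp(probs_[i, j]) / d_j     (compute_probs_kernel)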
ctcStatus_t ctc_status =
reduce_max(probs_, denoms_, out_dim_,
activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), probs_,
denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status =
reduce_exp(probs_, denoms_, out_dim_,
activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS)
return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::exponential<ProbT>(), probs_,
denoms_, out_dim_, num_elements);
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
label_lengths,
input_lengths,
best_config);
if (status != CTC_STATUS_SUCCESS)
return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS)
return status;
launch_gpu_kernels(probs_, grads, best_config,
compute_alpha, compute_betas_and_grad);
cudaError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = cudaMemcpyAsync(costs, nll_forward_,
sizeof(ProbT) * minibatch_,
cudaMemcpyDeviceToHost, stream_);
cuda_status_sync = cudaStreamSynchronize(stream_);
if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
grads == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, grads, costs, flat_labels,
label_lengths, input_lengths, true, true);
}
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr ||
costs == nullptr ||
flat_labels == nullptr ||
label_lengths == nullptr ||
input_lengths == nullptr
)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations, nullptr, costs, flat_labels,
label_lengths, input_lengths, true, false);
}
#pragma once
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include "ctc_helper.h"
using namespace mgpu;
template<int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum {NV = NT * VT};
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
//adapted from global kernel KernelReduceByKeyPreprocess
__device__ static void preprocessKeys(KeyT *keys, int count,
int *numUniqueLabels, int seg_start[VT],
int seg_end[VT], int *scanout) {
__shared__ Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if(index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
//Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
//output the unique keys
//use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ( (endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels); idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx-1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
//copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each row has
// S elements where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, thus a sparse
// set of values is read from the probs matrix, since the character set is much larger
// than the label sequence. This is even more true for Mandarin than for English.
template<typename ProbT, int NT, int VT>
__global__
void compute_alpha_kernel (const ProbT* probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_without_blanks, const int *label_offsets,
int *labels_with_blanks, ProbT *alphas,
ProbT* nll_forward, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2*L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T)
return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset+1] = labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int* label_global = &labels[blockIdx.x * S_memoffset];
ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
    // Set the first row of alpha to neg_inf - it is much more efficient to do
    // this here than outside the kernel
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
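    // At t = 0 only the leading blank (s = 0) and the first label (s = 1) are
    // reachable; when L + repeats == T every time step must advance through the
    // label sequence, so the optional leading blank is skipped (start = 1).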
// Initialize the first row corresponding to t=0;
for(int i = tid; i < (end-start); i += blockDim.x)
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for(int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
        // The probs matrix is a 2D column-major array; the probabilities for each t
        // are strided by (out_dim * stride), where stride is the minibatch size
const int start_prob_col = t * (out_dim * stride);
        // This is the first column (s = 0); there is nothing to the left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] = alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
}
else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above depending on whether we have a blank or repeated characters. Finally
// we add the probability corresponding to this label at time t
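        // In log space this is the usual CTC alpha recurrence:
        //   alpha[t][s] = logadd(alpha[t-1][s], alpha[t-1][s-1], alpha[t-1][s-2])
        //                 + log p(label[s] | t)
        // where the alpha[t-1][s-2] term is included only when s > 1, label[s] is
        // not a blank, and label[s] != label[s-2].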
#pragma unroll
for (int idx = (tid+1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], alpha[(idx-1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != 1) && (label[idx] != label[idx-2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx-2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
        // Log-add the rightmost one/two element(s) in the last row to obtain the log-likelihood.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
        // val is the total increment that start and end accumulate over the time loop
        // (cf. s_inc / e_inc in the CPU implementation), moving them to the last row.
const int val = 2 * (L-1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L!=0) + start);
end = (val * (L!=0) + end);
for(int i = start; i < end; ++i)
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
nll_forward[blockIdx.x] = -loglike;
}
}
// Computes backward probabilities (betas) and, fused with them, the gradient.
// Only the current row of betas is kept in shared memory; as each row is
// produced it is combined with the corresponding alphas and reduced per label
// into the gradient, so the full T * S beta matrix is never stored.
//
// See the comments above compute_alpha_kernel for more context.
template<typename ProbT, int NT, int VT>
__global__
void compute_betas_and_grad_kernel (const ProbT* probs, const int *label_sizes,
const int *utt_length, const int *repeats_in_labels,
const int *labels_with_blanks, ProbT *alphas,
const ProbT* nll_forward, ProbT *nll_backward,
ProbT *grads, int stride, int out_dim,
int S_memoffset, int T_memoffset, int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2*L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int* labels = labels_with_blanks;
const int* label_global = &labels[blockIdx.x * S_memoffset];
ProbT* alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__ TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__ ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T)
return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S-1;
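    // Mirror of the forward pass: at t = T-1 only the last label (s = S-2) and the
    // trailing blank (s = S-1) can be occupied, and the trailing blank is excluded
    // when L + repeats == T.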
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
    // Sort the labels and record the original indices to gather from during the segmented reduce
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>
(key, gather_val, keys_shared, gather_indices, S, tid, mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(keys_shared, S, &uniquelabels, seg_start, seg_end,
temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for(int i = tid; i < (end-start); i += blockDim.x)
temp_buffer.beta[i + start] =
log(probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
    // Combine alpha and beta for the last row into the shared output buffer - should really be a helper function
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
    // Iterate from t = T-1 backward in time; the beta recurrence itself starts at the second-to-last row (t = T-2)
for(int t = T - 1; t >= 0; --t) {
// Start offsets into the current and next row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T-1) {
            // Fill one row at a time, going backward in time from the last row to
            // the first. As in the forward pass there is no loop dependence within a
            // row; we apply a variable-length filter with a maximum width of 3
#pragma unroll
for(int idx = tid, i = 0; idx < (S-1); idx += NT, i++) {
ProbT next_sum = log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx+1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) &&
(idx != (S-2)) && (label[idx] != label[idx+2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx+2]);
beta_val[i] = next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
            // Update the rightmost element (the trailing blank) separately since there
            // is nothing to its right; this prepares the input buffer for the next iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S-1)] = temp_buffer.beta[(S-1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for(int idx = tid, i = 0; idx < (S-1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
            // Beta computation done - combine beta with alpha into the shared output
            // buffer; the segmented reduce below turns it into per-label gradient contributions
#pragma unroll
for(int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
            // Write the accumulated values back into output, which is free to be reused at this point
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
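            // For each unique label k the loop below applies the standard CTC gradient
            // with respect to the pre-softmax activations:
            //   grad_t(k) = y_t(k) - exp( logadd_{s : label[s] == k} (alpha_t(s) + beta_t(s))
            //                             - log y_t(k) - log p(l | x) )
            // where y_t(k) = probs[...] and log p(l | x) = log_partition.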
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
            if ((grad != 0.0) && (probs[grads_offset] != 0.0) &&
                (grad != ctc_helper::neg_inf<ProbT>())) {
                grads[grads_offset] =
                    probs[grads_offset] - exp(grad - log(probs[grads_offset]) - log_partition);
            }
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L-1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for(int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
        // Keep the block's barriers aligned across iterations and make sure every
        // thread is done with the shared buffers before they are overwritten
__syncthreads();
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f, ProbT* probs,
const ProbT* const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for(int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
const ProbT* const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for(int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
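// Taken together, prepare_stable_SM_kernel and compute_probs_kernel are the
// element-wise pieces of a numerically stable, column-wise softmax: the first
// applies the caller-supplied functor to the max-shifted activations, and the
// second applies its functor and divides by the per-column denominator in denom.
// The reductions that produce col_max and denom are performed elsewhere by the caller.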
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
#pragma once
ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
# Ignore compiled FFI location
warpctc_pytorch/_warp_ctc
# Created by https://www.gitignore.io/api/python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule.*
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# End of https://www.gitignore.io/api/python
import os
import platform
import sys
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CppExtension
import torch
extra_compile_args = ['-std=c++14', '-fPIC']
warp_ctc_path = "../build"
if platform.system() == 'Darwin':
    lib_ext = ".dylib"
else:
    lib_ext = ".so"
if "WARP_CTC_PATH" in os.environ:
    warp_ctc_path = os.environ["WARP_CTC_PATH"]
if not os.path.exists(os.path.join(warp_ctc_path, "libwarpctc" + lib_ext)):
    print(("Could not find libwarpctc{} in {}.\n"
           "Build warp-ctc and set WARP_CTC_PATH to the location of"
           " libwarpctc{} (default is '../build')").format(lib_ext, warp_ctc_path, lib_ext))
    sys.exit(1)
include_dirs = [os.path.realpath('../include')]
if torch.cuda.is_available() or "CUDA_HOME" in os.environ:
    enable_gpu = True
else:
    print("CUDA is not available to Torch and CUDA_HOME is not set; "
          "not building warp-ctc GPU extensions.")
    enable_gpu = False
if enable_gpu:
    from torch.utils.cpp_extension import CUDAExtension
    build_extension = CUDAExtension
    extra_compile_args += ['-DWARPCTC_ENABLE_GPU']
else:
    build_extension = CppExtension
ext_modules = [
    build_extension(
        name='warpctc_pytorch._warp_ctc',
        language='c++',
        sources=['src/binding.cpp'],
        include_dirs=include_dirs,
        library_dirs=[os.path.realpath(warp_ctc_path)],
        libraries=['warpctc'],
        extra_link_args=['-Wl,-rpath,' + os.path.realpath(warp_ctc_path)],
        extra_compile_args=extra_compile_args
    )
]
setup(
    name="warpctc_pytorch",
    version="0.1",
    description="PyTorch wrapper for warp-ctc",
    url="https://github.com/SeanNaren/warp-ctc",
    author="Jared Casper, Sean Naren",
    author_email="jared.casper@baidu.com, sean.narenthiran@digitalreasoning.com",
    license="Apache",
    packages=find_packages(),
    ext_modules=ext_modules,
    cmdclass={'build_ext': BuildExtension}
)
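# A typical build from the pytorch_binding directory, assuming warp-ctc itself has
# already been compiled into ../build (otherwise point WARP_CTC_PATH at the
# directory containing libwarpctc):
#
#   WARP_CTC_PATH=../build python setup.py install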
#include <iostream>
#include <vector>
#include <numeric>
#include <torch/extension.h>
#ifdef WARPCTC_ENABLE_GPU
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include "THC.h"
extern THCState* state;
#endif
#include "ctc.h"
int cpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_CPU;
options.num_threads = 0; // will use default number of threads
options.blank_label = blank_label;
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
    // have to use at least one thread
options.num_threads = std::max(options.num_threads, (unsigned int) 1);
#endif
size_t cpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &cpu_size_bytes);
float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
cpu_workspace, options);
delete[] cpu_workspace;
return 1;
}
#ifdef WARPCTC_ENABLE_GPU
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
torch::Tensor label_sizes,
torch::Tensor sizes,
int minibatch_size,
torch::Tensor costs,
int blank_label)
{
float* probs_ptr = (float*)probs.data_ptr();
float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL;
int* sizes_ptr = (int*)sizes.data_ptr();
int* labels_ptr = (int*)labels.data_ptr();
int* label_sizes_ptr = (int*)label_sizes.data_ptr();
float* costs_ptr = (float*)costs.data_ptr();
const int probs_size = probs.size(2);
ctcOptions options;
memset(&options, 0, sizeof(options));
options.loc = CTC_GPU;
options.blank_label = blank_label;
options.stream = at::cuda::getCurrentCUDAStream();
size_t gpu_size_bytes;
get_workspace_size(label_sizes_ptr, sizes_ptr,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
sizes_ptr, probs_size,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
return 1;
}
#endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
#ifdef WARPCTC_ENABLE_GPU
m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
#endif
}
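// These entry points are exposed to Python as the warpctc_pytorch._warp_ctc
// extension built by the setup.py above. As elsewhere in warp-ctc, probs is
// expected to be a float tensor laid out as (seq_len, minibatch, alphabet_size)
// - probs.size(2) is passed to the library as the alphabet size - and costs
// receives one loss value per minibatch entry.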