OpenDAS / Warpctc · Commits

Commit ffdb193b — authored May 16, 2023 by lishen

    warpctc for dcu

Parent: 99e2985d
Changes: 41 · Showing 20 changed files with 4452 additions and 0 deletions (+4452 -0)
.gitignore  +5 -0
README.md  +66 -0
README_HIP.md  +61 -0
doc/deep-speech-ctc-small.png  +0 -0
include/contrib/moderngpu/LICENSE  +26 -0
include/contrib/moderngpu/include/device/ctaloadbalance.cuh  +136 -0
include/contrib/moderngpu/include/device/ctamerge.cuh  +333 -0
include/contrib/moderngpu/include/device/ctascan.cuh  +308 -0
include/contrib/moderngpu/include/device/ctasearch.cuh  +207 -0
include/contrib/moderngpu/include/device/ctasegreduce.cuh  +238 -0
include/contrib/moderngpu/include/device/ctasegscan.cuh  +137 -0
include/contrib/moderngpu/include/device/ctasegsort.cuh  +443 -0
include/contrib/moderngpu/include/device/ctasortedsearch.cuh  +208 -0
include/contrib/moderngpu/include/device/devicetypes.cuh  +363 -0
include/contrib/moderngpu/include/device/deviceutil.cuh  +143 -0
include/contrib/moderngpu/include/device/intrinsics.cuh  +412 -0
include/contrib/moderngpu/include/device/loadstore.cuh  +674 -0
include/contrib/moderngpu/include/device/serialsets.cuh  +235 -0
include/contrib/moderngpu/include/device/sortnetwork.cuh  +168 -0
include/contrib/moderngpu/include/mgpudevice.cuh  +289 -0
.gitignore (0 → 100644)

.idea
*~
Makefile
build
README.md (0 → 100644)

# PyTorch bindings for Warp-ctc

[Build status](https://travis-ci.org/SeanNaren/warp-ctc)

This is an extension onto the original repo found [here](https://github.com/baidu-research/warp-ctc).

## Installation

Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.

`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:

```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```

Now install the bindings:

```bash
cd pytorch_binding
python setup.py install
```

If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch):

```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$WHOAMI/anaconda3/lib
```

This will resolve the "library not loaded" error. This can easily be modified to work with other python installs if needed.

An example of using the bindings is below.

```python
import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1],
                            [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True)  # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
```

## Documentation

```
CTCLoss(size_average=False, length_average=False)
    # size_average (bool): normalize the loss by the batch size (default: False)
    # length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)

forward(acts, labels, act_lens, label_lens)
    # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
    # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
    # act_lens: Tensor of size (batch) containing size of each output sequence from the network
    # label_lens: Tensor of (batch) containing label length of each example
```
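
As a quick illustration of the two normalization flags (a sketch; the semantics follow the documentation above):

```python
from warpctc_pytorch import CTCLoss

loss_sum = CTCLoss()                           # sum of per-example losses (default)
loss_batch_avg = CTCLoss(size_average=True)    # sum divided by the batch size
loss_frame_avg = CTCLoss(length_average=True)  # sum divided by total frames; supersedes size_average
```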
README_HIP.md (0 → 100644)

# DLIB

## Environment setup

Before building with DCU, the build environment needs to be prepared. See [environment prepare](environment_prepare.md).

## Installing from source

### Preparing the build environment (using dtk-23.04 as an example)

- Fetch the code:

```
git clone -b dtk-23.04 http://developer.hpccube.com/codes/aicomponent/warpctc.git
```

- From the DCU Toolkit section of the [developer community](https://developer.hpccube.com/tool/#sdk), download DTK-23.04, extract it under /opt/, and create a symlink:

```
cd /opt && ln -s dtk-23.04 dtk
```

- Export the environment variables and install the required dependencies:

```shell
source /opt/dtk/env.sh
```

### Build and install

#### Building the Python API

- Install with python:

```shell
python setup.py install
```

- Build a whl package with python:

```shell
python setup.py bdist_wheel
```
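
If the wheel build succeeds, setuptools writes the package to the `dist/` directory by default (a standard setuptools assumption; adjust the path if your configuration differs), from where it can be installed with `pip install dist/<wheel-name>.whl`.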
### Tests

- Verify the correctness of the warpctc loss (CPU/GPU consistency; see the sketch after this list):

```shell
python3 test_gpu.py
```

- Verify the GPU speedup of the warpctc loss:

```shell
python3 test_gpu_speed.py
```
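
Such a consistency check has roughly the following shape — a hypothetical sketch of the idea, not the actual contents of test_gpu.py — evaluating the same loss on CPU and on GPU and comparing the results:

```python
import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss()
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1],
                            [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])

cpu_cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
# Moving the activations to the device selects the GPU path; labels and
# size tensors stay on the CPU in warp-ctc's binding (an assumption here).
gpu_cost = ctc_loss(probs.cuda(), labels, probs_sizes, label_sizes)
assert torch.allclose(cpu_cost, gpu_cost.cpu(), atol=1e-5)
```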
doc/deep-speech-ctc-small.png (0 → 100644)

[binary image, 139 KB]
include/contrib/moderngpu/LICENSE (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
include/contrib/moderngpu/include/device/ctaloadbalance.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "ctasearch.cuh"
#include "loadstore.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// DeviceLoadBalancingSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.

template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
    int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {

    int bKey = b_shared[bBegin];

    #pragma unroll
    for(int i = 0; i < VT; ++i) {
        bool p;
        if(RangeCheck)
            p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
        else
            p = aBegin < bKey;

        if(p)
            // Advance A (the needle).
            a_shared[aBegin++] = bFirst + bBegin;
        else
            // Advance B (the haystack).
            bKey = b_shared[++bBegin];
    }
}

////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator<int>(first), b_global) - 1.
// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.
// CTALoadBalance requires NT * VT + 2 slots of shared memory.

template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
    int sourceCount, int block, int tid, const int* mp_global,
    int* indices_shared, bool loadPrecedingB) {

    int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
        mp_global);

    int a0 = range.x;
    int a1 = range.y;
    int b0 = range.z;
    int b1 = range.w;
    if(!b0) loadPrecedingB = false;

    // Load one trailing term from B. If we're already at the end, fill the
    // end of the buffer with destCount.
    int aCount = a1 - a0;
    int bCount = b1 - b0;
    int extended = b1 < sourceCount;
    int loadCount = bCount + extended;
    int fillCount = NT * VT + 1 - loadCount - aCount;

    int* a_shared = indices_shared;
    int* b_shared = indices_shared + aCount + (int)loadPrecedingB;

    // Load the B values.
    // DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
    //     b_global + b0 - (int)loadPrecedingB, tid,
    //     b_shared - (int)loadPrecedingB);
    for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
        b_shared[i] = b_global[b0 + i];

    // Fill the end of the array with destCount.
    for(int i = tid + extended; i < fillCount; i += NT)
        b_shared[bCount + i] = destCount;
    __syncthreads();

    // Run a merge path to find the start of the serial merge for each thread.
    int diag = VT * tid;
    int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
        aCount, b_shared, bCount, diag, mgpu::less<int>());
    int a0tid = a0 + mp;
    int b0tid = diag - mp;

    // Subtract 1 from b0 because we want to return upper_bound - 1.
    DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
        b0tid, bCount, a_shared - a0);
    __syncthreads();

    b0 -= (int)loadPrecedingB;
    return make_int4(a0, a1, b0, b1);
}

} // namespace mgpu
include/contrib/moderngpu/include/device/ctamerge.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// SerialMerge

template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
    int bBegin, int bEnd, T* results, int* indices, Comp comp) {

    T aKey = keys_shared[aBegin];
    T bKey = keys_shared[bBegin];

    #pragma unroll
    for(int i = 0; i < VT; ++i) {
        bool p;
        if(RangeCheck)
            p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
        else
            p = !comp(bKey, aKey);

        results[i] = p ? aKey : bKey;
        indices[i] = p ? aBegin : bBegin - !RangeCheck;

        if(p) aKey = keys_shared[++aBegin];
        else bKey = keys_shared[++bBegin];
    }
    __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and
// global merge pass levels) locate lists within the single source array.

// Returns (offset of a, offset of b, length of list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
    // coop is the number of CTAs or threads cooperating to merge two lists
    // into one. We round block down to the first CTA's ID that is working on
    // this merge.
    int start = ~(coop - 1) & block;
    int size = nv * (coop >> 1);
    return make_int3(nv * start, nv * start + size, size);
}

// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
    int nv, int count, int mp0, int mp1) {

    // Locate diag from the start of the A sublist.
    int diag = nv * block - frame.x;
    int a0 = frame.x + mp0;
    int a1 = min(count, frame.x + mp1);
    int b0 = min(count, frame.y + diag - mp0);
    int b1 = min(count, frame.y + diag + nv - mp1);

    // The end partition of the last block for each merge operation is computed
    // and stored as the begin partition for the subsequent merge. i.e. it is
    // the same partition but in the wrong coordinate system, so it's 0 when it
    // should be listSize. Correct that by checking if this is the last block
    // in this merge operation.
    if(coop - 1 == ((coop - 1) & block)) {
        a1 = min(count, frame.x + frame.z);
        b1 = min(count, frame.y + frame.z);
    }
    return make_int4(a0, a1, b0, b1);
}

////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange

MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
    int coop, int NV, const int* mp_global) {

    // Load the merge paths computed by the partitioning kernel.
    int mp0 = mp_global[block];
    int mp1 = mp_global[block + 1];
    int gid = NV * block;

    // Compute the ranges of the sources in global memory.
    int4 range;
    if(coop) {
        int3 frame = FindMergesortFrame(coop, block, NV);
        range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0,
            mp1);
    } else {
        range.x = mp0;                                          // a0
        range.y = mp1;                                          // a1
        range.z = gid - range.x;                                // b0
        range.w = min(aCount + bCount, gid + NV) - range.y;     // b1
    }
    return range;
}

////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support

template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
    int coop, T* keys, int* indices, Comp comp) {

    int list = ~(coop - 1) & tid;
    int diag = min(count, VT * ((coop - 1) & tid));
    int start = VT * list;
    int a0 = min(count, start);
    int b0 = min(count, start + VT * (coop / 2));
    int b1 = min(count, start + VT * coop);

    int p = MergePath<MgpuBoundsLower>(keys_shared + a0, b0 - a0,
        keys_shared + b0, b1 - b0, diag, comp);

    SerialMerge<VT, true>(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
        indices, comp);
}

template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
    typename Comp>
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
    KeyType* keys_shared, ValType* values_shared, int tid, int count,
    Comp comp) {

    #pragma unroll
    for(int coop = 2; coop <= NT; coop *= 2) {
        int indices[VT];
        KeyType keys[VT];
        CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
            indices, comp);

        if(HasValues) {
            // Exchange the values through shared memory.
            DeviceThreadToShared<VT>(threadValues, tid, values_shared);
            DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
                threadValues);
        }

        // Store results in shared memory in sorted order.
        DeviceThreadToShared<VT>(keys, tid, keys_shared);
    }
}

////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This function sorts the first
// count elements.

template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
    typename ValType, typename Comp>
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT],
    ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
    int count, int tid, Comp comp) {

    // Stable sort the keys in the thread.
    if(VT * tid < count) {
        if(Stable)
            OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
        else
            OddEvenMergesort<VT>(threadKeys, threadValues, comp);
    }

    // Store the locally sorted keys into shared memory.
    DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);

    // Recursively merge lists until the entire CTA is sorted.
    CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
        values_shared, tid, count, comp);
}

template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
    KeyType* keys_shared, int count, int tid, Comp comp) {

    int valuesTemp[VT];
    CTAMergesort<NT, VT, Stable, false>(threadKeys, valuesTemp, keys_shared,
        (int*)keys_shared, count, tid, comp);
}

template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
    typename Comp>
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
    ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
    int count, int tid, Comp comp) {

    CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
        values_shared, count, tid, comp);
}

////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices

template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
    typename T, typename Comp>
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount,
    It2 b_global, int bCount, int4 range, int tid, T* keys_shared,
    T* results, int* indices, Comp comp) {

    int a0 = range.x;
    int a1 = range.y;
    int b0 = range.z;
    int b1 = range.w;

    if(LoadExtended) {
        bool extended = (a1 < aCount) && (b1 < bCount);
        aCount = a1 - a0;
        bCount = b1 - b0;
        int aCount2 = aCount + (int)extended;
        int bCount2 = bCount + (int)extended;

        // Load one element past the end of each input to avoid having to use
        // range checking in the merge loop.
        DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + a0, aCount2,
            b_global + b0, bCount2, tid, keys_shared);

        // Run a Merge Path search for each thread's starting point.
        int diag = VT * tid;
        int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
            keys_shared + aCount2, bCount, diag, comp);

        // Compute the ranges of the sources in shared memory.
        int a0tid = mp;
        int b0tid = aCount2 + diag - mp;

        if(extended) {
            SerialMerge<VT, false>(keys_shared, a0tid, 0, b0tid, 0, results,
                indices, comp);
        } else {
            int a1tid = aCount;
            int b1tid = aCount2 + bCount;
            SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
                results, indices, comp);
        }
    } else {
        // Use the input intervals from the ranges between the merge path
        // intersections.
        aCount = a1 - a0;
        bCount = b1 - b0;

        // Load the data into shared memory.
        DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
            bCount, tid, keys_shared);

        // Run a merge path to find the start of the serial merge for each
        // thread.
        int diag = VT * tid;
        int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
            keys_shared + aCount, bCount, diag, comp);

        // Compute the ranges of the sources in shared memory.
        int a0tid = mp;
        int a1tid = aCount;
        int b0tid = aCount + diag - mp;
        int b1tid = aCount + bCount;

        // Serial merge into register.
        SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
            results, indices, comp);
    }
}

////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.

template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
    typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
    typename KeyType, typename ValsIt3, typename Comp>
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
    int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
    int tid, int block, int4 range, KeyType* keys_shared,
    int* indices_shared, KeysIt3 keys_global, ValsIt3 vals_global,
    Comp comp) {

    KeyType results[VT];
    int indices[VT];
    DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
        bKeys_global, bCount, range, tid, keys_shared, results, indices,
        comp);

    // Store merge results back to shared memory.
    DeviceThreadToShared<VT>(results, tid, keys_shared);

    // Store merged keys to global memory.
    aCount = range.y - range.x;
    bCount = range.w - range.z;
    DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
        keys_global + NT * VT * block);

    // Copy the values.
    if(HasValues) {
        DeviceThreadToShared<VT>(indices, tid, indices_shared);
        DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
            aVals_global + range.x, bVals_global + range.z, aCount,
            indices_shared, tid, vals_global + NT * VT * block);
    }
}

} // namespace mgpu
include/contrib/moderngpu/include/device/ctascan.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// CTAReduce

template<int NT, typename Op = mgpu::plus<int> >
struct CTAReduce {
    typedef typename Op::first_argument_type T;
    enum { Size = NT, Capacity = NT };
    struct Storage { T shared[Capacity]; };

    MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
        storage.shared[tid] = x;
        __syncthreads();

        // Fold the data in half with each pass.
        #pragma unroll
        for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
            if(tid < destCount) {
                // Read from the right half and store to the left half.
                x = op(x, storage.shared[destCount + tid]);
                storage.shared[tid] = x;
            }
            __syncthreads();
        }
        T total = storage.shared[0];
        __syncthreads();
        return total;
    }
};

#if __CUDA_ARCH__ >= 300

template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    typedef int T;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        // In the first phase, threads cooperatively find the reduction within
        // their segment. The segments are SecSize threads (NT / WARP_SIZE)
        // wide.
        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_add(x, offset, SecSize);

        // The last thread in each segment stores the local reduction to
        // shared memory.
        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        // Reduce the totals of each input segment. The spine is WARP_SIZE
        // threads wide.
        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_add(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
    typedef mgpu::maximum<int> Op;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_max(x, offset, SecSize);

        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_max(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTAScan

template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
    typedef typename Op::result_type T;
    enum { Size = NT, Capacity = 2 * NT + 1 };
    struct Storage { T shared[Capacity]; };

    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
        MgpuScanType type = MgpuScanTypeExc, T identity = (T)0,
        Op op = Op()) {

        storage.shared[tid] = x;
        int first = 0;
        __syncthreads();

        #pragma unroll
        for(int offset = 1; offset < NT; offset += offset) {
            if(tid >= offset)
                x = op(storage.shared[first + tid - offset], x);
            first = NT - first;
            storage.shared[first + tid] = x;
            __syncthreads();
        }
        *total = storage.shared[first + NT - 1];

        if(MgpuScanTypeExc == type)
            x = tid ? storage.shared[first + tid - 1] : identity;

        __syncthreads();
        return x;
    }
    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
        T total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
    }
};

////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan<NT, ScanOpAdd> on Kepler.
// This uses the shfl intrinsic to reduce scan latency.

#if __CUDA_ARCH__ >= 300

template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
    enum { Capacity = NumSegments + 1 };
    struct Storage { int shared[Capacity + 1]; };

    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
        MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {

        // Define WARP_SIZE segments that are NT / WARP_SIZE large.
        // Each warp makes log(SegSize) shfl_add calls.
        // The spine makes log(WARP_SIZE) shfl_add calls.
        int lane = (SegSize - 1) & tid;
        int segment = tid / SegSize;

        // Scan each segment using shfl_add.
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < SegSize; offset *= 2)
            scan = shfl_add(scan, offset, SegSize);

        // Store the reduction (last element) of each segment into storage.
        if(SegSize - 1 == lane) storage.shared[segment] = scan;
        __syncthreads();

        // Warp 0 does a full shfl warp scan on the partials. The total is
        // stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
        if(tid < NumSegments) {
            int y = storage.shared[tid];
            int scan = y;
            #pragma unroll
            for(int offset = 1; offset < NumSegments; offset *= 2)
                scan = shfl_add(scan, offset, NumSegments);
            storage.shared[tid] = scan - y;
            if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
        }
        __syncthreads();

        // Add the scanned partials back in and convert to exclusive scan.
        scan += storage.shared[segment];
        if(MgpuScanTypeExc == type) {
            scan -= x;
            if(identity && !tid) scan = identity;
        }
        *total = storage.shared[NumSegments];
        __syncthreads();

        return scan;
    }
    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
        int total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan

template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
    const int NumWarps = NT / WARP_SIZE;
    int warp = tid / WARP_SIZE;
    int lane = (WARP_SIZE - 1);

    // Store the bit totals for each warp.
    uint bits = __ballot(x);
    shared[warp] = popc(bits);
    __syncthreads();

#if __CUDA_ARCH__ >= 300
    if(tid < NumWarps) {
        int x = shared[tid];
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < NumWarps; offset *= 2)
            scan = shfl_add(scan, offset, NumWarps);
        shared[tid] = scan - x;
    }
    __syncthreads();
#else
    // Thread 0 scans warp totals.
    if(!tid) {
        int scan = 0;
        #pragma unroll
        for(int i = 0; i < NumWarps; ++i) {
            int y = shared[i];
            shared[i] = scan;
            scan += y;
        }
        shared[NumWarps] = scan;
    }
    __syncthreads();
#endif // __CUDA_ARCH__ >= 300

    // Add the warp scan back into the partials.
    int scan = shared[warp] + __popc(bfe(bits, 0, lane));
    *total = shared[NumWarps];
    __syncthreads();
    return scan;
}

} // namespace mgpu
include/contrib/moderngpu/include/device/ctasearch.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "deviceutil.cuh"
#include "../mgpudevice.cuh"

namespace mgpu {

template<MgpuBounds Bounds, typename IntT, typename It, typename T,
    typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
    int shift, Comp comp) {

    IntT scale = (1 << shift) - 1;
    int mid = (int)((begin + scale * end) >> shift);

    T key2 = data[mid];
    bool pred = (MgpuBoundsUpper == Bounds) ?
        !comp(key, key2) :
        comp(key2, key);
    if(pred) begin = mid + 1;
    else end = mid;
}

template<MgpuBounds Bounds, typename IntT, typename T, typename It,
    typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key,
    int levels, Comp comp) {

    int begin = 0;
    int end = count;

    if(levels >= 4 && begin < end)
        BinarySearchIt<Bounds, IntT>(data, begin, end, key, 9, comp);
    if(levels >= 3 && begin < end)
        BinarySearchIt<Bounds, IntT>(data, begin, end, key, 7, comp);
    if(levels >= 2 && begin < end)
        BinarySearchIt<Bounds, IntT>(data, begin, end, key, 5, comp);
    if(levels >= 1 && begin < end)
        BinarySearchIt<Bounds, IntT>(data, begin, end, key, 4, comp);

    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
    int begin = 0;
    int end = count;
    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// MergePath search

template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount,
    int diag, Comp comp) {

    typedef typename std::iterator_traits<It1>::value_type T;
    int begin = max(0, diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end) >> 1;
        T aKey = a[mid];
        T bKey = b[diag - 1 - mid];
        bool pred = (MgpuBoundsUpper == Bounds) ?
            comp(aKey, bKey) :
            !comp(bKey, aKey);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search

template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset,
    int aCount, int bOffset, int bCount, int leftEnd, int rightStart,
    int diag, Comp comp) {

    // leftEnd and rightStart are defined from the origin, and diag is defined
    // from aOffset.
    // We only need to run a Merge Path search if the diagonal intersects the
    // segment that strides the left and right halves (i.e. is between leftEnd
    // and rightStart).
    if(aOffset + diag <= leftEnd) return diag;
    if(aOffset + diag >= rightStart) return aCount;

    bCount = min(bCount, rightStart - bOffset);
    int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end) >> 1;
        int ai = aOffset + mid;
        int bi = bOffset + diag - 1 - mid;

        bool pred = !comp(keys[bi], keys[ai]);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// BalancedPath search

template<bool Duplicates, typename IntT, typename InputIt1,
    typename InputIt2, typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
    int bCount, int diag, int levels, Comp comp) {

    typedef typename std::iterator_traits<InputIt1>::value_type T;

    int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
    int aIndex = p;
    int bIndex = diag - p;

    bool star = false;
    if(bIndex < bCount) {
        if(Duplicates) {
            T x = b[bIndex];

            // Search for the beginning of the duplicate run in both A and B.
            // Because
            int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex,
                x, levels, comp);
            int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex,
                x, levels, comp);

            // The distance between the merge path and the lower_bound is the
            // 'run'. We add up the a- and b- runs and evenly distribute them
            // to get a stairstep path.
            int aRun = aIndex - aStart;
            int bRun = bIndex - bStart;
            int xCount = aRun + bRun;

            // Attempt to advance b and regress a.
            int bAdvance = max(xCount >> 1, bRun);
            int bEnd = min(bCount, bStart + bAdvance + 1);
            int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
                bEnd - bIndex, x, comp) + bIndex;
            bRun = bRunEnd - bStart;

            bAdvance = min(bAdvance, bRun);
            int aAdvance = xCount - bAdvance;

            bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
            aIndex = aStart + aAdvance;

            if(roundUp) star = true;
        } else {
            if(aIndex && aCount) {
                T aKey = a[aIndex - 1];
                T bKey = b[bIndex];

                // If the last consumed element in A (aIndex - 1) is the same
                // as the next element in B (bIndex), we're sitting at a
                // starred partition.
                if(!comp(aKey, bKey)) star = true;
            }
        }
    }
    return make_int2(aIndex, star);
}

} // namespace mgpu
include/contrib/moderngpu/include/device/ctasegreduce.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "ctasegscan.cuh"
#include "ctasearch.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.

// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.
struct SegReduceRange {
    int begin;
    int end;
    int total;
    bool flushLast;
};

MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
    SegReduceRange range;
    range.begin = 0x7fffffff & limit0;
    range.end = 0x7fffffff & limit1;
    range.total = range.end - range.begin;
    range.flushLast = 0 == (0x80000000 & limit1);
    range.end += !range.flushLast;
    return range;
}

// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval
// Spmv.
template<int VT>
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
    int rows[VT + 1]) {

    rows[0] = first;
    #pragma unroll
    for(int i = 0; i < VT; ++i) {
        if((1 << i) & endFlags) ++first;
        rows[i + 1] = first;
    }
}

////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.

template<int NT, int VT>
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
    int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {

    // Each thread binary searches for its starting row.
    int row = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
        mgpu::less<int>()) - 1;

    // Each thread starts at row and scans forward, emitting row IDs into
    // register. Store the CTA-local row index (starts at 0) to rows and the
    // start of the row (globally) to rowStarts.
    int curOffset = csr_shared[row];
    int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;

    rows[0] = row;
    rowStarts[0] = curOffset;
    int endFlags = 0;

    #pragma unroll
    for(int i = 1; i <= VT; ++i) {
        // Advance the row cursor when the iterator hits the next row offset.
        if(tidOffset + i == nextOffset) {
            // Set an end flag when the cursor advances to the next row.
            endFlags |= 1 << (i - 1);

            // Advance the cursor and load the next row offset.
            ++row;
            curOffset = nextOffset;
            nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
        }
        rows[i] = row;
        if(i < VT) rowStarts[i] = curOffset;
    }
    __syncthreads();

    return endFlags;
}

////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute
// end-flags by comparing adjacent row IDs.
// DeviceSegReducePrepare may be called either by a pre-processing kernel or
// by the kernel that actually evaluates the segmented reduction if no
// preprocessing is desired.

struct SegReduceTerms {
    int endFlags;
    int tidDelta;
};

template<int NT, int VT>
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared,
    int numRows, int tid, int gid, bool flushLast, int rows[VT + 1],
    int rowStarts[VT]) {

    // Pass a sentinel (end) to point to the next segment start. If we flush,
    // this is the end of this tile. Otherwise it is INT_MAX.
    int endFlags = DeviceExpandCsrRows<NT, VT>(gid + VT * tid, csr_shared,
        numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);

    // Find the distance to scan to compute carry-in for each thread. Use the
    // existence of an end flag anywhere in the thread to determine if
    // carry-out values from the left should propagate through to the right.
    int tidDelta = DeviceFindSegScanDelta<NT>(tid, rows[0] != rows[VT],
        csr_shared);

    SegReduceTerms terms = { endFlags, tidDelta };
    return terms;
}

////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for
// intra-CTA segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.

template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
struct CTASegReduce {
    typedef CTASegScan<NT, Op> SegScan;

    enum {
        NV = NT * VT,
        Capacity = HalfCapacity ? (NV / 2) : NV
    };

    union Storage {
        typename SegScan::Storage segScanStorage;
        T values[Capacity];
    };

    template<typename DestIt>
    MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
        int tidDelta, int startRow, int block, int tid, T data[VT],
        DestIt dest_global, T* carryOut_global, T identity, Op op,
        Storage& storage) {

        // Run a segmented scan within the thread.
        T x, localScan[VT];
        #pragma unroll
        for(int i = 0; i < VT; ++i) {
            x = i ? op(x, data[i]) : data[i];
            localScan[i] = x;
            if(rows[i] != rows[i + 1]) x = identity;
        }

        // Run a parallel segmented scan over the carry-out values to compute
        // carry-in.
        T carryOut;
        T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
            storage.segScanStorage, &carryOut, identity, op);

        // Store the carry-out for the entire CTA to global memory.
        if(!tid) carryOut_global[block] = carryOut;

        dest_global += startRow;
        if(HalfCapacity && total > Capacity) {
            // Add carry-in to each thread-local scan value. Store directly
            // to global.
            #pragma unroll
            for(int i = 0; i < VT; ++i) {
                // Add the carry-in to the local scan.
                T x2 = op(carryIn, localScan[i]);

                // Store on the end flag and clear the carry-in.
                if(rows[i] != rows[i + 1]) {
                    carryIn = identity;
                    dest_global[rows[i]] = x2;
                }
            }
        } else {
            // All partials fit in shared memory. Add carry-in to each
            // thread-local scan value.
            #pragma unroll
            for(int i = 0; i < VT; ++i) {
                // Add the carry-in to the local scan.
                T x2 = op(carryIn, localScan[i]);

                // Store reduction when the segment changes and clear the
                // carry-in.
                if(rows[i] != rows[i + 1]) {
                    storage.values[rows[i]] = x2;
                    carryIn = identity;
                }
            }
            __syncthreads();

            // Cooperatively store reductions to global memory.
            for(int index = tid; index < total; index += NT)
                dest_global[index] = storage.values[index];
            __syncthreads();
        }
    }
};

} // namespace mgpu
include/contrib/moderngpu/include/device/ctasegscan.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once

#include "ctascan.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.

template<int NT>
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag,
    int* delta_shared) {

    const int NumWarps = NT / 32;

    int warp = tid / 32;
    int lane = 31 & tid;
    uint warpMask = 0xffffffff >> (31 - lane);      // inclusive search
    uint ctaMask = 0x7fffffff >> (31 - lane);       // exclusive search

    uint warpBits = __ballot(flag);
    delta_shared[warp] = warpBits;
    __syncthreads();

    if(tid < NumWarps) {
        uint ctaBits = __ballot(0 != delta_shared[tid]);
        int warpSegment = 31 - clz(ctaMask & ctaBits);
        int start = (-1 != warpSegment) ?
            (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
        delta_shared[NumWarps + tid] = start;
    }
    __syncthreads();

    // Find the closest flag to the left of this thread within the warp.
    // Include the flag for this thread.
    int start = 31 - clz(warpMask & warpBits);
    if(-1 != start) start += ~31 & tid;
    else start = delta_shared[NumWarps + warp];
    __syncthreads();

    return tid - start;
}

////////////////////////////////////////////////////////////////////////////////
// CTASegScan

template<int NT, typename _Op = mgpu::plus<int> >
struct CTASegScan {
    typedef _Op Op;
    typedef typename Op::result_type T;
    enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
    union Storage {
        int delta[NumWarps];
        T values[Capacity];
    };

    // Each thread passes the reduction of the LAST SEGMENT that it covers.
    // flag is set to true if there's at least one segment flag in the thread.
    // SegScan returns the reduction of values for the first segment in this
    // thread over the preceding threads.
    // Return the value init for the first thread.

    // When scanning single elements per thread, interpret the flag as a BEGIN
    // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
    // thread tid.

    // The function returns the reduction of the last segment in the CTA.

    MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
        Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {

        // Run an inclusive scan
        int first = 0;
        storage.values[first + tid] = x;
        __syncthreads();

        #pragma unroll
        for(int offset = 1; offset < NT; offset += offset) {
            if(tidDelta >= offset)
                x = op(storage.values[first + tid - offset], x);
            first = NT - first;
            storage.values[first + tid] = x;
            __syncthreads();
        }

        // Get the exclusive scan.
        x = tid ? storage.values[first + tid - 1] : identity;
        *carryOut = storage.values[first + NT - 1];
        __syncthreads();
        return x;
    }

    MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
        T* carryOut, T identity = (T)0, Op op = Op()) {

        // Find the left-most thread that covers the first segment of this
        // thread.
        int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);

        return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity,
            op);
    }
};

} // namespace mgpu
include/contrib/moderngpu/include/device/ctasegsort.cuh (0 → 100644)
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {

template<int VT, typename T, typename Comp>
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
	int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
	int leftEnd, int rightStart, Comp comp, bool sync = true) {

	bEnd = min(rightStart, bEnd);
	T aKey = keys_shared[aBegin];
	T bKey = keys_shared[bBegin];

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool p;

		// If A has run out of inputs, emit B.
		if(aBegin >= aEnd)
			p = false;
		else if(bBegin >= bEnd || aBegin < leftEnd)
			// B has hit the end of the middle segment.
			// Emit A if A has inputs remaining in the middle segment.
			p = true;
		else
			// Emit the smaller element in the middle segment.
			p = !comp(bKey, aKey);

		results[i] = p ? aKey : bKey;
		indices[i] = p ? aBegin : bBegin;
		if(p) aKey = keys_shared[++aBegin];
		else bKey = keys_shared[++bBegin];
	}
	if(sync) {
		__syncthreads();
	}
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass

template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
	int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {

	// Locate the intervals of the input lists.
	int3 frame = FindMergesortFrame(2 << pass, tid, VT);
	int a0 = frame.x;
	int b0 = frame.y;
	int listLen = frame.z;
	int list = tid >> pass;
	int listParity = 1 & list;
	int diag = VT * tid - frame.x;

	// Fetch the active range for the list this thread's list is merging with.
	int siblingRange = ranges_shared[1 ^ list];
	int siblingStart = 0x0000ffff & siblingRange;
	int siblingEnd = siblingRange >> 16;

	// Create a new active range for the merge.
	int leftEnd = listParity ? siblingEnd : activeRange.y;
	int rightStart = listParity ? activeRange.x : siblingStart;
	activeRange.x = min(activeRange.x, siblingStart);
	activeRange.y = max(activeRange.y, siblingEnd);

	int p = SegmentedMergePath(keys_shared, a0, listLen, b0, listLen,
		leftEnd, rightStart, diag, comp);

	int a0tid = a0 + p;
	int b0tid = b0 + diag - p;
	SegmentedSerialMerge<VT>(keys_shared, a0tid, b0, b0tid, b0 + listLen,
		results, indices, leftEnd, rightStart, comp);

	// Store the ranges to shared memory.
	if(0 == diag)
		ranges_shared[list >> 1] =
			(int)bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop

template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
	ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
	int* ranges_shared, int tid, int2 activeRange, Comp comp) {

	const int NumPasses = sLogPow2<NT>::value;
	#pragma unroll
	for(int pass = 0; pass < NumPasses; ++pass) {
		int indices[VT];
		CTASegsortPass<NT, VT>(keys_shared, ranges_shared, tid, pass,
			threadKeys, indices, activeRange, comp);

		if(HasValues) {
			// Exchange values through shared memory.
			DeviceThreadToShared<VT>(threadValues, tid, values_shared);
			DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
				threadValues);
		}

		// Store results in shared memory in sorted order.
		DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
	}
	return activeRange;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.

template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
	typename ValType, typename Comp>
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
	int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
	int* ranges_shared, Comp comp) {

	if(Stable)
		// Odd-even transpose sort.
		OddEvenTransposeSortFlags<VT>(threadKeys, threadValues, headFlags,
			comp);
	else
		// Batcher's odd-even mergesort.
		OddEvenMergesortFlags<VT>(threadKeys, threadValues, headFlags, comp);

	// Record the first and last occurrence of head flags in this segment.
	int blockEnd = 31 - clz(headFlags);
	if(-1 != blockEnd) blockEnd += VT * tid;

	int blockStart = ffs(headFlags);
	blockStart = blockStart ? (VT * tid - 1 + blockStart) : (NT * VT);

	ranges_shared[tid] = (int)bfi(blockEnd, blockStart, 16, 16);

	// Store back to shared mem. The values are in VT-length sorted lists.
	// These are merged recursively.
	DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);

	int2 activeRange = CTASegsortLoop<NT, VT, HasValues>(threadKeys,
		threadValues, keys_shared, values_shared, ranges_shared, tid,
		make_int2(blockStart, blockEnd), comp);
	return activeRange;
}
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid,
	int headFlags, KeyType* keys_shared, int* ranges_shared, Comp comp) {

	int valuesTemp[VT];
	return CTASegsort<NT, VT, Stable, false>(threadKeys, valuesTemp, tid,
		headFlags, keys_shared, (int*)keys_shared, ranges_shared, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
	ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
	ValType* values_shared, int* ranges_shared, Comp comp) {

	return CTASegsort<NT, VT, Stable, true>(threadKeys, threadValues, tid,
		headFlags, keys_shared, values_shared, ranges_shared, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.

template<int NT, int VT, bool Stable, bool HasValues, typename InputIt1,
	typename InputIt2, typename KeyType, typename ValType, typename OutputIt1,
	typename OutputIt2, typename Comp>
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
	InputIt2 values_global, int count2, KeyType* keys_shared,
	ValType* values_shared, int* ranges_shared, int headFlags, int tid,
	int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
	int* ranges_global, Comp comp) {

	// Load keys into register in thread order.
	int gid = NT * VT * block;
	KeyType threadKeys[VT];
	DeviceGlobalToShared<NT, VT>(count2, keys_global + gid, tid, keys_shared);
	DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);

	// Load the values from global memory and into register in thread order.
	ValType threadValues[VT];
	if(HasValues) {
		DeviceGlobalToShared<NT, VT>(count2, values_global + gid, tid,
			values_shared);
		DeviceSharedToThread<VT>(values_shared, tid, threadValues);
	}

	// Run the CTA segmented blocksort.
	int2 activeRange = CTASegsort<NT, VT, Stable, HasValues>(threadKeys,
		threadValues, tid, headFlags, keys_shared, values_shared,
		ranges_shared, comp);

	// Store the keys to global memory.
	DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
		keysDest_global + gid);

	if(HasValues) {
		// Store the values to global memory.
		DeviceThreadToShared<VT>(threadValues, tid, values_shared);
		DeviceSharedToGlobal<NT, VT>(count2, values_shared, tid,
			valsDest_global + gid, false);
	}

	// Store the 16-bit packed ranges. These are used by all merge kernels and
	// the first level of global segmented merge path partitioning.
	if(!tid)
		ranges_global[block] = bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.

template<int NT, int VT>
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
	const int* partitions_global, int tid, int block, int count2,
	int* words_shared, byte* flags_shared) {

	const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
	int gid = NT * VT * block;
	int p0 = partitions_global[block];
	int p1 = partitions_global[block + 1];

	int headFlags = 0;
	if(p1 > p0 || count2 < NT * VT) {
		// Clear the flag bytes, then loop through the indices and poke in
		// flag values.
		#pragma unroll
		for(int i = 0; i < FlagWordsPerThread; ++i)
			words_shared[NT * i + tid] = 0;
		__syncthreads();

		for(int index = p0 + tid; index < p1; index += NT) {
			int headFlag = indices_global[index];
			flags_shared[headFlag - gid] = 1;
		}
		__syncthreads();

		// Combine all the head flags for this thread.
		int first = VT * tid;
		int offset = first / 4;
		int prev = words_shared[offset];
		int mask = 0x3210 + 0x1111 * (3 & first);

		#pragma unroll
		for(int i = 0; i < FlagWordsPerThread; ++i) {
			// Gather the next four flags.
			int next = words_shared[offset + 1 + i];
			int x = prmt(prev, next, mask);
			prev = next;

			// Set the head flag bits.
			if(0x00000001 & x) headFlags |= 1 << (4 * i);
			if(0x00000100 & x) headFlags |= 1 << (4 * i + 1);
			if(0x00010000 & x) headFlags |= 1 << (4 * i + 2);
			if(0x01000000 & x) headFlags |= 1 << (4 * i + 3);
		}
		__syncthreads();

		// Set head flags for out-of-range keys.
		int outOfRange = min(VT, first + VT - count2);
		if(outOfRange > 0)
			headFlags = bfi(0xffffffff, headFlags, VT - outOfRange,
				outOfRange);

		// Clear head flags above VT.
		headFlags &= (1 << VT) - 1;
	}
	return headFlags;
}
////////////////////////////////////////////////////////////////////////////////
// SegSortSupport

struct SegSortSupport {
	int* ranges_global;
	int2* ranges2_global;

	int4* mergeList_global;
	int* copyList_global;
	int2* queueCounters_global;
	int2* nextCounters_global;

	byte* copyStatus_global;
};
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge

template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType,
	typename Comp>
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
	const ValueType* values_global, int2 segmentRange, int tid, int block,
	int4 range, int pass, KeyType* keys_shared, int* indices_shared,
	KeyType* keysDest_global, ValueType* valsDest_global, Comp comp) {

	const int NV = NT * VT;
	int gid = NV * block;

	// Load the local compressed segment indices.
	int a0 = range.x;
	int aCount = range.y - range.x;
	int b0 = range.z;
	int bCount = range.w - range.z;
	DeviceLoad2ToShared<NT, VT, VT>(keys_global + a0, aCount,
		keys_global + b0, bCount, tid, keys_shared);

	////////////////////////////////////////////////////////////////////////////
	// Run a merge path to find the starting point for each thread to merge.
	// If the entire warp fits into the already-sorted segments, we can skip
	// sorting it and leave its keys in shared memory. Doing this on the warp
	// level rather than thread level (also legal) gives slightly better
	// performance.

	int segStart = segmentRange.x;
	int segEnd = segmentRange.y;
	int listParity = 1 & (block >> pass);
	int warpOffset = VT * (~31 & tid);
	bool sortWarp = listParity ?
		// The spliced segment is to the left (segStart).
		(warpOffset < segStart) :
		// The spliced segment is to the right (segEnd).
		(warpOffset + 32 * VT > segEnd);

	KeyType threadKeys[VT];
	int indices[VT];
	if(sortWarp) {
		int diag = VT * tid;
		int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
			listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
		int a0tid = mp;
		int a1tid = aCount;
		int b0tid = aCount + diag - mp;
		int b1tid = aCount + bCount;

		// Serial merge into register. All threads in the CTA so we hoist the
		// check for list parity outside the function call to simplify the
		// logic. Unlike in the blocksort, this does not cause warp divergence.
		SegmentedSerialMerge<VT>(keys_shared, a0tid, a1tid, b0tid, b1tid,
			threadKeys, indices, listParity ? 0 : segEnd,
			listParity ? segStart : NV, comp, false);
	}
	__syncthreads();

	// Store sorted data in register back to shared memory. Then copy to
	// global.
	if(sortWarp)
		DeviceThreadToShared<VT>(threadKeys, tid, keys_shared, false);
	__syncthreads();

	DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
		keysDest_global + gid);

	////////////////////////////////////////////////////////////////////////////
	// Use the merge indices to gather values from global memory. Store
	// directly to valsDest_global.

	if(HasValues) {
		// Transpose the gather indices to help coalesce loads.
		if(sortWarp)
			DeviceThreadToShared<VT>(indices, tid, indices_shared, false);
		else {
			#pragma unroll
			for(int i = 0; i < VT; ++i)
				indices_shared[VT * tid + i] = VT * tid + i;
		}
		__syncthreads();

		DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
			values_global + a0, values_global + b0, aCount, indices_shared,
			tid, valsDest_global + NV * block);
	}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy

template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType>
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
	const ValueType* values_global, int tid, int block, int count,
	KeyType* keysDest_global, ValueType* valsDest_global) {

	int gid = NT * VT * block;
	int count2 = min(NT * VT, count - gid);
	DeviceGlobalToGlobal<NT, VT>(count2, keys_global + gid, tid,
		keysDest_global + gid);
	if(HasValues)
		DeviceGlobalToGlobal<NT, VT>(count2, values_global + gid, tid,
			valsDest_global + gid);
}

} // namespace mgpu
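// Editor's sketch (not part of the commit): roughly how CTASegsortKeys is
// driven from a blocksort-style kernel, using the load/store helpers this
// commit also adds. The kernel name BlocksortDemo and its launch shape are
// assumptions; keys arrive in thread order with one begin-flag bit per value,
// and the sorted keys are left in shared memory.
//
// template<int NT, int VT>
// __global__ void BlocksortDemo(const int* keys_global, int* keysDest_global,
//     const uint* flags_global, int count) {
//
//     const int NV = NT * VT;
//     __shared__ int keys_shared[NV];
//     __shared__ int ranges_shared[NT];
//     int tid = threadIdx.x;
//     int block = blockIdx.x;
//     int count2 = min(NV, count - NV * block);
//
//     int threadKeys[VT];
//     mgpu::DeviceGlobalToShared<NT, VT>(count2, keys_global + NV * block,
//         tid, keys_shared);
//     mgpu::DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);
//
//     // VT begin-flag bits for this thread's values.
//     int headFlags = mgpu::DeviceExtractHeadFlags(flags_global,
//         NV * block + VT * tid, VT);
//
//     mgpu::CTASegsortKeys<NT, VT, true>(threadKeys, tid, headFlags,
//         keys_shared, ranges_shared, mgpu::less<int>());
//
//     // Sorted keys are now in keys_shared; copy them out.
//     mgpu::DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
//         keysDest_global + NV * block);
// }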
include/contrib/moderngpu/include/device/ctasortedsearch.cuh 0 → 100644
/* Copyright (c) 2013, NVIDIA CORPORATION. BSD 3-clause license and
   attribution header, identical to the one in ctasegsort.cuh above. */
#pragma once
#include "../mgpudevice.cuh"
#include "ctasearch.cuh"
namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch

template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
	bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
	int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
	Comp comp) {

	const int FlagA = IndexA ? 0x80000000 : 1;
	const int FlagB = IndexB ? 0x80000000 : 1;

	T aKey = keys_shared[aBegin];
	T bKey = keys_shared[bBegin];
	T aPrev, bPrev;
	if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
	if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
	int decisions = 0;
	int matchCountA = 0;
	int matchCountB = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool p;
		if(RangeCheck && aBegin >= aEnd) p = false;
		else if(RangeCheck && bBegin >= bEnd) p = true;
		else p = (MgpuBoundsUpper == Bounds) ?
			comp(aKey, bKey) :
			!comp(bKey, aKey);

		if(p) {
			// aKey is smaller than bKey, so it is inserted before bKey.
			// Save bKey's index (bBegin + first) as the result of the search
			// and advance to the next needle in A.
			bool match = false;
			if(MatchA) {
				// Test if there is an element in B that matches aKey.
				if(MgpuBoundsUpper == Bounds) {
					// Upper Bound: We're inserting aKey after bKey. If there
					// is a match for aKey it must be bPrev. Check that bPrev
					// is in range and equal to aKey.
					// The predicate test result !comp(aKey, bPrev) was
					// established on the previous A-advancing iteration (it
					// failed the comp(aKey, bKey) test to get us to this
					// point). Check the other half of the equality condition
					// with a second comparison.
					bool inRange = !RangeCheck || (bBegin > aEnd);
					match = inRange && !comp(bPrev, aKey);
				} else {
					// Lower Bound: We're inserting aKey before bKey. If there
					// is a match for aKey, it must be bKey. Check that bKey
					// is in range and equal to aKey.
					// The predicate test !comp(bKey, aKey) has established one
					// half of the equality condition. We establish the other
					// half with a second comparison.
					bool inRange = !RangeCheck || (bBegin < bEnd);
					match = inRange && !comp(aKey, bKey);
				}
			}

			int index = 0;
			if(IndexA) index = bOffset + bBegin;
			if(match) index |= FlagA;
			if(IndexA || MatchA) indices[i] = index;
			matchCountA += match;

			// Mark the decision bit to indicate that this iteration has
			// progressed A (the needles).
			decisions |= 1 << i;
			aPrev = aKey;
			aKey = keys_shared[++aBegin];
		} else {
			// aKey is larger than bKey, so it is inserted after bKey (but we
			// don't know where yet). Advance the B index to the next element
			// in the haystack to continue the search for the current needle.
			bool match = false;
			if(MatchB) {
				if(MgpuBoundsUpper == Bounds) {
					// Upper Bound: aKey is not smaller than bKey. We advance
					// to the next haystack element in B. If there is a match
					// in A for bKey it must be aKey. By entering this branch
					// we've verified that !comp(aKey, bKey). Making the
					// reciprocal comparison !comp(bKey, aKey) establishes
					// aKey == bKey.
					bool inRange = !RangeCheck ||
						((bBegin < bEnd) && (aBegin < aEnd));
					match = inRange && !comp(bKey, aKey);
				} else {
					// Lower Bound: bKey is smaller than aKey. We advance to
					// the next element in B. If there is a match for bKey, it
					// must be aPrev. The previous A-advancing iteration proved
					// that !comp(bKey, aPrev). We test !comp(aPrev, bKey) for
					// the other half of the equality condition.
					bool inRange = !RangeCheck ||
						((bBegin < bEnd) && (aBegin > 0));
					match = inRange && !comp(aPrev, bKey);
				}
			}

			int index = 0;
			if(IndexB) index = aOffset + aBegin;
			if(match) index |= FlagB;
			if(IndexB || MatchB) indices[i] = index;
			matchCountB += match;

			// Keep the decision bit cleared to indicate that this iteration
			// has progressed B (the haystack).
			bPrev = bKey;
			bKey = keys_shared[++bBegin];
		}
	}
	return make_int3(decisions, matchCountA, matchCountB);
}
////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.

template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
	bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
	int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
	int tid, int* indices_shared, Comp comp) {

	// Run a merge path to find the start of the serial search for each
	// thread.
	int diag = VT * tid;
	int mp = MergePath<Bounds>(keys_shared + aStart, aCount,
		keys_shared + bStart, bCount, diag, comp);
	int a0tid = mp;
	int b0tid = diag - mp;

	// Serial search into register.
	int3 results;
	int indices[VT];
	if(extended)
		results = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA,
			IndexB, MatchB>(keys_shared, a0tid + aStart, aEnd,
			b0tid + bStart, bEnd, a0 - aStart, b0 - bStart, indices, comp);
	else
		results = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA,
			IndexB, MatchB>(keys_shared, a0tid + aStart, aEnd,
			b0tid + bStart, bEnd, a0 - aStart, b0 - bStart, indices, comp);
	__syncthreads();

	// Compact the indices into shared memory. Use the decision bits (set is
	// A, cleared is B) to select the destination.
	int decisions = results.x;
	b0tid += aCount;
	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		if((1 << i) & decisions) {
			if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
		} else {
			if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
		}
	}
	__syncthreads();

	// Return the match counts for A and B keys.
	return make_int2(results.y, results.z);
}

} // namespace mgpu
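// Editor's note (not part of the commit): DeviceSerialSearch packs one
// decision bit per iteration into a 32-bit int, which is why VT must not
// exceed 32. With VT = 4 and decisions = 0b0101, for example, iterations 0
// and 2 advanced A (the needles) while iterations 1 and 3 advanced B (the
// haystack); CTASortedSearch uses exactly these bits to route each
// iteration's result into the A or B region of indices_shared.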
include/contrib/moderngpu/include/device/devicetypes.cuh 0 → 100644
/* Copyright (c) 2013, NVIDIA CORPORATION. BSD 3-clause license and
   attribution header, identical to the one in ctasegsort.cuh above. */
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MGPU. TRY 2.0!"
#endif
#include <climits>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {

#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL

const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators

template<typename T>
struct less : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators

template<typename T>
struct plus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
////////////////////////////////////////////////////////////////////////////////

template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
	T c = a;
	a = b;
	b = c;
}

template<typename T>
struct DevicePair {
	T x, y;
};

template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
	DevicePair<T> p = { x, y };
	return p;
}
template<typename T>
struct numeric_limits;

template<> struct numeric_limits<int> {
	MGPU_HOST_DEVICE static int min() { return INT_MIN; }
	MGPU_HOST_DEVICE static int max() { return INT_MAX; }
	MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
	MGPU_HOST_DEVICE static int AddIdent() { return 0; }
	MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
	MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
	MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
	MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
	MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
	MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
	MGPU_HOST_DEVICE static uint min() { return 0; }
	MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
	MGPU_HOST_DEVICE static uint lowest() { return 0; }
	MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
	MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
	MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
	MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
	MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
	MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
	MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
	MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
	MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
	MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
	MGPU_HOST_DEVICE static float AddIdent() { return 0; }
	MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
	MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
	MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
	MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
	MGPU_HOST_DEVICE static double AddIdent() { return 0; }
	MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
	return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
	a = a + b;
	return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
	return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
	a = a * b;
	return a;
}

template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
	return std::max(a, b);
#else
	return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
	return std::min(a, b);
#else
	return (b < a) ? b : a;
#endif
}

MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
	return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
	return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
	MGPU_HOST_DEVICE static int2 min() {
		return make_int2(INT_MIN, INT_MIN);
	}
	MGPU_HOST_DEVICE static int2 max() {
		return make_int2(INT_MAX, INT_MAX);
	}
	MGPU_HOST_DEVICE static int2 lowest() {
		return make_int2(INT_MIN, INT_MIN);
	}
	MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
	MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
	MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const { return _value; }
	MGPU_HOST_DEVICE T operator*() const { return _value; }
	MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
		return constant_iterator(_value);
	}
	MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
		return constant_iterator(_value);
	}
	MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
		return *this;
	}
	MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
		return *this;
	}
private:
	T _value;
};

template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
	MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) { return _value + i; }
	MGPU_HOST_DEVICE T operator*() { return _value; }
	MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
		return counting_iterator(_value + diff);
	}
	MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
		return counting_iterator(_value - diff);
	}
	MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
		_value += diff;
		return *this;
	}
	MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
		_value -= diff;
		return *this;
	}
private:
	T _value;
};

template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
	MGPU_HOST_DEVICE step_iterator(T base, T step) :
		_base(base), _step(step), _offset(0) { }
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
		return _base + (_offset + i) * _step;
	}
	MGPU_HOST_DEVICE T operator*() { return _base + _offset * _step; }
	MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
		step_iterator it = *this;
		it._offset += diff;
		return it;
	}
	MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
		step_iterator it = *this;
		it._offset -= diff;
		return it;
	}
	MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
		_offset += diff;
		return *this;
	}
	MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
		_offset -= diff;
		return *this;
	}
private:
	ptrdiff_t _offset;
	T _base, _step;
};

} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
	mgpu::counting_iterator<T> it) {
	return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
	mgpu::counting_iterator<T> it) {
	return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
	mgpu::step_iterator<T> it) {
	return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
	mgpu::step_iterator<T> it) {
	return it + (-diff);
}
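// Editor's sketch (not part of the commit): the iterators above implement
// just enough of a random-access interface for kernels to read them like
// plain pointers. A host-side illustration of the indexing behavior:
//
// mgpu::counting_iterator<int> counting(10);
// int a = counting[5];                     // 15: value + index
// mgpu::step_iterator<int> step(100, 10);  // base 100, stride 10
// int b = step[3];                         // 130: base + 3 * stride
// mgpu::constant_iterator<int> constant(7);
// int c = constant[123];                   // 7, regardless of index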
include/contrib/moderngpu/include/device/deviceutil.cuh 0 → 100644
/* Copyright (c) 2013, NVIDIA CORPORATION. BSD 3-clause license and
   attribution header, identical to the one in ctasegsort.cuh above. */
#pragma once
#include "intrinsics.cuh"
namespace mgpu {

// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
	return (const byte*)b - (const byte*)a;
}

// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
	return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
	return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.

MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
	div_t d = div(numItems, numWorkers);
	return make_int2(d.quot, d.rem);
}

MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
	int2 range;
	range.x = task.x * block;
	range.x += min(block, task.y);
	range.y = range.x + task.x + (block < task.y);
	return range;
}

MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
	int count) {
	int2 range = ComputeTaskRange(block, task);
	range.x *= blockSize;
	range.y = min(count, range.y * blockSize);
	return range;
}
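// Editor's note (not part of the commit): a worked example of the task-range
// helpers. Distributing numItems = 10 over numWorkers = 3 gives
// DivideTaskRange(10, 3) == (quot 3, rem 1), and ComputeTaskRange then hands
// out [0,4), [4,7), [7,10) to blocks 0..2 - the first `rem` blocks each
// receive one extra item.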
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// ExtractThreadHeadFlags returns numBits flags starting at bit index.

MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
	int numBits) {

	int index2 = index >> 5;
	int shift = 31 & index;
	uint headFlags = flags[index2] >> shift;
	int shifted = 32 - shift;

	if(shifted < numBits)
		// We also need to shift in the next set of bits.
		headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
	headFlags &= (1 << numBits) - 1;
	return headFlags;
}
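// Editor's note (not part of the commit): when the requested run of flags
// crosses a word boundary, e.g. index = 30 and numBits = 4, the function
// first takes bits 30..31 of flags[0], then splices in the low bits of
// flags[1] with bfi, and finally masks the result down to numBits bits.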
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack VT bits per thread at 32 bits/thread. Will consume an integer number
// of words, because CTA size is a multiple of 32. The first NT * VT / 32
// threads return packed words.

template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
	uint* flags_shared) {

	const int WordCount = NT * VT / 32;

	// Each thread stores its thread bits to flags_shared[tid].
	flags_shared[tid] = threadBits;
	__syncthreads();

	uint packed = 0;
	if(tid < WordCount) {
		const int Items = MGPU_DIV_UP(32, VT);
		int index = 32 * tid;
		int first = index / VT;
		int bit = 0;

		int rem = index - VT * first;
		packed = flags_shared[first] >> rem;
		bit = VT - rem;
		++first;

		#pragma unroll
		for(int i = 0; i < Items; ++i) {
			if(i < Items - 1 || bit < 32) {
				uint x = flags_shared[first + i];
				if(bit < 32) packed |= x << bit;
				bit += VT;
			}
		}
	}
	__syncthreads();

	return packed;
}

} // namespace mgpu
include/contrib/moderngpu/include/device/intrinsics.cuh 0 → 100644
/* Copyright (c) 2013, NVIDIA CORPORATION. BSD 3-clause license and
   attribution header, identical to the one in ctasegsort.cuh above. */
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {

MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
	return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
	return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
	return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
	return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
	return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
	return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
	reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
	return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
	reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
	return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi

#if __CUDA_ARCH__ >= 200

MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
	uint result;
	asm("bfe.u32 %0, %1, %2, %3;" :
		"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
	return result;
}

MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
	uint result;
	asm("bfi.b32 %0, %1, %2, %3, %4;" :
		"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
	return result;
}

MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
	uint ret;
	asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
	return ret;
}

#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up

__device__ __forceinline__ float shfl_up(float var, unsigned int delta,
	int width = 32) {

#if __CUDA_ARCH__ >= 300
	var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
	return var;
}

__device__ __forceinline__ double shfl_up(double var, unsigned int delta,
	int width = 32) {

#if __CUDA_ARCH__ >= 300
	int2 p = mgpu::double_as_int2(var);
	p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
	p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
	var = mgpu::int2_as_double(p);
#endif
	return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add

MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width) << 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
		"@p add.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
	return result;
}

MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width) << 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
		"@p max.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
	return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt

// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
	uint y = __brev(x);
#else
	uint y = 0;
	for(int i = 0; i < 32; ++i)
		y |= (1 & (x >> i)) << (31 - i);
#endif
	return y;
}

// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
	return __popc(x);
#else
	int c;
	for(c = 0; x; ++c)
		x &= x - 1;
	return c;
#endif
}

// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
	return __clz(x);
#else
	for(int i = 31; i >= 0; --i)
		if((1 << i) & x) return 31 - i;
	return 32;
#endif
}

// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
	return __ffs(x);
#else
	for(int i = 0; i < 32; ++i)
		if((1 << i) & x) return i + 1;
	return 0;
#endif
}
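// Editor's note (not part of the commit): a few spot values for the bit
// helpers above: popc(0xF0) == 4, clz(1) == 31, ffs(0x8) == 4, ffs(0) == 0.
// The idiom 31 - clz(flags) yields the position of the highest set bit, or
// -1 when flags == 0, which is how CTASegsort computes blockEnd.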
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
	return bfe_ptx(x, bit, numBits);
#else
	return ((1 << numBits) - 1) & (x >> bit);
#endif
}

MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
	uint result;
#if __CUDA_ARCH__ >= 200
	result = bfi_ptx(x, y, bit, numBits);
#else
	if(bit + numBits > 32) numBits = 32 - bit;
	uint mask = ((1 << numBits) - 1) << bit;
	result = y & ~mask;
	result |= mask & (x << bit);
#endif
	return result;
}

MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
	uint result;
#if __CUDA_ARCH__ >= 200
	result = prmt_ptx(a, b, index);
#else
	result = 0;
	for(int i = 0; i < 4; ++i) {
		uint sel = 0xf & (index >> (4 * i));
		uint x = ((7 & sel) > 3) ? b : a;
		x = 0xff & (x >> (8 * (3 & sel)));
		if(8 & sel) x = (128 & x) ? 0xff : 0;
		result |= x << (8 * i);
	}
#endif
	return result;
}

// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
	int a = 31 - clz(x);
	if(roundUp) a += !MGPU_IS_POW_2(x);
	return a;
}
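// Editor's note (not part of the commit): FindLog2(32) == 5 and
// FindLog2(33) == 5, while FindLog2(33, true) == 6 - the roundUp flag adds
// one exactly when x is not a power of two.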
////////////////////////////////////////////////////////////////////////////////
// vset4

#if __CUDA_ARCH__ >= 300

// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
	uint result;
	asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
		"=r"(result) : "r"(a), "r"(b), "r"(c));
	return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
	uint result;
	asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
		"=r"(result) : "r"(a), "r"(b), "r"(0));
	return result;
}

#endif // __CUDA_ARCH__ >= 300

MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
	uint result;
#if __CUDA_ARCH__ >= 300
	result = vset4_lt_add_ptx(a, b, c);
#else
	result = c;
	if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
	if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
	if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
	if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
	return result;
}

MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
	uint result;
#if __CUDA_ARCH__ >= 300
	result = vset4_eq_ptx(a, b);
#else
	result = 0;
	if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
	if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
	if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
	if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
	return result;
}
////////////////////////////////////////////////////////////////////////////////
//

MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
	return __umulhi(x, y);
#else
	uint64 product = (uint64)x * y;
	return (uint)(product >> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to
// the __ldg intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for
// types supported by __ldg in sm_32_intrinsics.h

template<typename T>
struct IsLdgType {
	enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
	template<> struct IsLdgType<T> { enum { value = true }; };

template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
	MGPU_DEVICE static T Ldg(const T* p) {
		return *p;
	}
};

#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400

// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)

DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)

DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)

template<typename T>
struct LdgShim<T, true> {
	MGPU_DEVICE static T Ldg(const T* p) {
		return __ldg(p);
	}
};
#endif

template<typename T>
MGPU_DEVICE T ldg(const T* p) {
	return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.

struct FastDivide {
	uint denom;
	uint coef;
	uint shift;

	MGPU_HOST_DEVICE uint Divide(uint x) {
		return umulhi(x, coef) >> shift;
	}
	MGPU_HOST_DEVICE uint Modulus(uint x) {
		return x - Divide(x) * denom;
	}

	explicit FastDivide(uint denom_) {
		denom = denom_;
		uint p = 31 + FindLog2(denom, true);
		coef = (uint)(((1ull << p) + denom - 1) / denom);
		shift = p - 32;
	}
};

#pragma GCC diagnostic pop

} // namespace mgpu
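// Editor's sketch (not part of the commit): FastDivide trades a runtime
// division by a reused divisor for a multiply-high and a shift, so it is
// constructed once and applied many times. The values below are illustrative:
//
// mgpu::FastDivide fd(1000);       // requires denom > 1
// uint q = fd.Divide(123456);      // 123 (valid for any x < 2^31)
// uint r = fd.Modulus(123456);     // 456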
include/contrib/moderngpu/include/device/loadstore.cuh 0 → 100644
(diff collapsed; contents not shown)
include/contrib/moderngpu/include/device/serialsets.cuh 0 → 100644
/* Copyright (c) 2013, NVIDIA CORPORATION. BSD 3-clause license and
   attribution header, identical to the one in ctasegsort.cuh above. */
#pragma once
#include "deviceutil.cuh"
namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.

template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int MinIterations = VT / 2;
	int commit = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool test = RangeCheck ?
			((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
			(i < MinIterations || (aBegin + bBegin < end));

		if(test) {
			T aKey = data[aBegin];
			T bKey = data[bBegin];

			bool pA = comp(aKey, bKey);
			bool pB = comp(bKey, aKey);

			// The outputs must come from A by definition of set intersection.
			results[i] = aKey;
			indices[i] = aBegin;

			if(!pB) ++aBegin;
			if(!pA) ++bBegin;
			if(pA == pB) commit |= 1 << i;
		}
	}
	return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.

template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int MinIterations = VT / 2;
	int commit = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool test = RangeCheck ?
			(aBegin + bBegin < end) :
			(i < MinIterations || (aBegin + bBegin < end));

		if(test) {
			T aKey = data[aBegin];
			T bKey = data[bBegin];

			bool pA = false, pB = false;
			if(RangeCheck && aBegin >= aEnd)
				pB = true;
			else if(RangeCheck && bBegin >= bEnd)
				pA = true;
			else {
				// Both are in range.
				pA = comp(aKey, bKey);
				pB = comp(bKey, aKey);
			}

			// Output A in case of a tie, so check if b < a.
			results[i] = pB ? bKey : aKey;
			indices[i] = pB ? bBegin : aBegin;
			if(!pB) ++aBegin;
			if(!pA) ++bBegin;
			commit |= 1 << i;
		}
	}
	return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.

template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int MinIterations = VT / 2;
	int commit = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool test = RangeCheck ?
			(aBegin + bBegin < end) :
			(i < MinIterations || (aBegin + bBegin < end));
		if(test) {
			T aKey = data[aBegin];
			T bKey = data[bBegin];

			bool pA = false, pB = false;
			if(RangeCheck && aBegin >= aEnd)
				pB = true;
			else if(RangeCheck && bBegin >= bEnd)
				pA = true;
			else {
				pA = comp(aKey, bKey);
				pB = comp(bKey, aKey);
			}

			// The outputs must come from A by definition of set difference.
			results[i] = aKey;
			indices[i] = aBegin;
			if(!pB) ++aBegin;
			if(!pA) ++bBegin;
			if(pA) commit |= 1 << i;
		}
	}
	return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.

template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
    int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

    const int MinIterations = VT / 2;
    int commit = 0;

    #pragma unroll
    for(int i = 0; i < VT; ++i) {
        bool test = RangeCheck ?
            (aBegin + bBegin < end) :
            (i < MinIterations || (aBegin + bBegin < end));

        if(test) {
            T aKey = data[aBegin];
            T bKey = data[bBegin];

            bool pA = false, pB = false;
            if(RangeCheck && (bBegin >= bEnd))
                pA = true;
            else if(RangeCheck && (aBegin >= aEnd))
                pB = true;
            else {
                pA = comp(aKey, bKey);
                pB = comp(bKey, aKey);
            }

            results[i] = pA ? aKey : bKey;
            indices[i] = pA ? aBegin : bBegin;
            if(!pA) ++bBegin;
            if(!pB) ++aBegin;
            if(pA != pB) commit |= 1<< i;
        }
    }
    return commit;
}
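With all four serial ops in view, their differences reduce to what each slot emits and when it commits. An informal side-by-side, read directly off the code above (with pA = comp(a, b) and pB = comp(b, a)):

```cpp
// intersection: emit aKey;              commit when pA == pB  (a == b)
// union:        emit pB ? bKey : aKey;  commit every visited slot (tie -> A)
// difference:   emit aKey;              commit when pA        (a < b)
// symdiff:      emit pA ? aKey : bKey;  commit when pA != pB
```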
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.

template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
    int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {

    int end = aBegin + bBegin + VT - star;
    if(RangeCheck) end = min(end, aEnd + bEnd);

    int commit;
    switch(Op) {
        case MgpuSetOpIntersection:
            commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin, aEnd,
                bBegin, bEnd, end, results, indices, comp);
            break;
        case MgpuSetOpUnion:
            commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
                bBegin, bEnd, end, results, indices, comp);
            break;
        case MgpuSetOpDiff:
            commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
                bBegin, bEnd, end, results, indices, comp);
            break;
        case MgpuSetOpSymDiff:
            commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
                bBegin, bEnd, end, results, indices, comp);
            break;
    }
    __syncthreads();
    return commit;
}

} // namespace mgpu
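The commit word returned by SerialSetOp is consumed by CTA-level kernels elsewhere in the library, which typically count each thread's committed slots with `__popc` and scan the counts to obtain compact write offsets. A host-side analog of that filtering step, assuming `__builtin_popcount` as the stand-in for `__popc` (illustrative, not library code):

```cpp
#include <cstdio>

int main() {
    // Pretend one thread ran a VT = 5 serial set op and produced:
    int results[] = {1, 3, 3, 5, 7};
    int commit = 0b01100;                       // slots 2 and 3 committed

    int count = __builtin_popcount(commit);     // __popc(commit) on device
    printf("%d committed outputs:", count);
    for(int i = 0; i < 5; ++i)
        if((1 << i) & commit) printf(" %d", results[i]);
    printf("\n");                               // 2 committed outputs: 3 5
    return 0;
}
```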
include/contrib/moderngpu/include/device/sortnetwork.cuh
0 → 100644
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort

// The CUDA compiler does not currently unroll these loops correctly, so the
// sort is written with template loop unrolling instead:
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
    #pragma unroll
    for(int level = 0; level < VT; ++level) {
        #pragma unroll
        for(int i = 1 & level; i < VT - 1; i += 2) {
            if(comp(keys[i + 1], keys[i])) {
                mgpu::swap(keys[i], keys[i + 1]);
                mgpu::swap(values[i], values[i + 1]);
            }
        }
    }
}
*/
template<int I, int VT>
struct OddEvenTransposeSortT {
    // Sort segments marked by head flags. If the head flag between i and i + 1
    // is set (so that (2<< i) & flags is true), the values belong to different
    // segments and are not swapped.
    template<typename K, typename V, typename Comp>
    static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
        #pragma unroll
        for(int i = 1 & I; i < VT - 1; i += 2)
            if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
                mgpu::swap(keys[i], keys[i + 1]);
                mgpu::swap(values[i], values[i + 1]);
            }
        OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
    }
};
template<int I> struct OddEvenTransposeSortT<I, I> {
    template<typename K, typename V, typename Comp>
    static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
    OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
    Comp comp) {
    OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
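A host-side analog of the flagged network shows the segmented behavior (illustrative only: int keys, a plain `<` comparator, values elided). A head flag between slots 2 and 3 keeps the two segments from mixing, and each segment comes out sorted on its own:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int VT = 6;
    int keys[VT] = {5, 2, 9, 4, 8, 1};
    int flags = 2 << 2;            // segment boundary between slots 2 and 3

    for(int level = 0; level < VT; ++level)
        for(int i = 1 & level; i < VT - 1; i += 2)
            if(0 == ((2 << i) & flags) && keys[i + 1] < keys[i])
                std::swap(keys[i], keys[i + 1]);

    for(int i = 0; i < VT; ++i) printf("%d ", keys[i]);
    printf("\n");   // prints: 2 5 9 1 4 8 -- each segment sorted independently
    return 0;
}
```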
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort

template<int Width, int Low, int Count>
struct OddEvenMergesortT {
    template<typename K, typename V, typename Comp>
    MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
        int a, int b, Comp comp) {
        if(b < Count) {
            // Mask the bits between a and b. Any head flags in this interval
            // mean the keys are in different segments and must not be swapped.
            const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
            if(!(Mask & flags) && comp(keys[b], keys[a])) {
                mgpu::swap(keys[b], keys[a]);
                mgpu::swap(values[b], values[a]);
            }
        }
    }

    template<int R, int Low2, bool Recurse = 2 * R < Width>
    struct OddEvenMerge {
        template<typename K, typename V, typename Comp>
        MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
            Comp comp) {
            // Compare and swap
            const int M = 2 * R;
            OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
            OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);

            #pragma unroll
            for(int i = Low2 + R; i + R < Low2 + Width; i += M)
                CompareAndSwap(keys, values, flags, i, i + R, comp);
        }
    };
    template<int R, int Low2>
    struct OddEvenMerge<R, Low2, false> {
        template<typename K, typename V, typename Comp>
        MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
            Comp comp) {
            CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
        }
    };

    template<typename K, typename V, typename Comp>
    MGPU_DEVICE static void Sort(K* keys, V* values, int flags, Comp comp) {
        const int M = Width / 2;
        OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
        OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
        OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
    }
};
template<int Low, int Count>
struct OddEvenMergesortT<1, Low, Count> {
    template<typename K, typename V, typename Comp>
    MGPU_DEVICE static void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
    const int Width = 1<< sLogPow2<VT, true>::value;
    OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
    Comp comp) {
    const int Width = 1<< sLogPow2<VT, true>::value;
    OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}

} // namespace mgpu
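Batcher's network is defined only for power-of-two widths, so VT is rounded up and slots at index >= Count are screened out by the `b < Count` guard in CompareAndSwap. A host-side stand-in for the rounding, assuming `sLogPow2<VT, true>::value` computes a ceiling log2 (`CeilLog2` here is a hypothetical helper, not the library's):

```cpp
// Hypothetical ceiling-log2 mirroring the assumed sLogPow2<VT, true> behavior.
constexpr int CeilLog2(int n) { return n <= 1 ? 0 : 1 + CeilLog2((n + 1) / 2); }

static_assert(1 << CeilLog2(7) == 8, "VT = 7 runs on a width-8 network");
static_assert(1 << CeilLog2(8) == 8, "powers of two are unchanged");
```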
include/contrib/moderngpu/include/mgpudevice.cuh
0 → 100644
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh

// For 0 <= i < VT:
//     index = NT * i + tid;
//     reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
    bool sync = true);
// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
    T* reg, bool sync = false);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
    T* reg, T init, bool sync = false);

// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
    T* reg, bool sync = false);

// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
    T* reg, T init, bool sync = false);

// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
    T* reg, bool sync = false);
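All of the register loads declared above share one strided, guarded pattern. As a rough sketch of what the definitions in device/loadstore.cuh look like (hypothetical name and body, shown only to make the comments above concrete, not the library's actual code):

```cpp
// Strided, guarded load: thread tid reads elements NT * i + tid for each of
// its VT slots, skipping any index past count.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void GlobalToRegSketch(int count, InputIt data, int tid, T* reg) {
    #pragma unroll
    for(int i = 0; i < VT; ++i) {
        int index = NT * i + tid;
        if(index < count) reg[i] = data[index];
    }
}
```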
// For 0 <= i < VT:
//     index = VT * tid + i;
//     if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
    T* reg);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
    T* reg, T init);
// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
    bool sync = true);

// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
    OutputIt dest, bool sync = false);
// For 0 <= index < count:
//     dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
    OutputIt dest, bool sync = true);

// For 0 <= index < count:
//     dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
    OutputIt dest, bool sync = true);

// For 0 <= index < count:
//     dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
    T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
    T* dest, bool sync = true);

// For 0 <= index < count:
//     dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
    T* dest, bool sync = true);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source,
    int tid, T* dest, T init, bool sync = true);

template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
    int tid, T* dest, T init, bool sync = true);
// For 0 <= index < count:
//     dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
    OutputIt dest, bool sync = false);

// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count)
//         gather = indices[index];
//         reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
    int tid, T* reg, bool sync = true);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data,
    int indices[VT], int tid, T* reg, T identity, bool sync = true);

// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count)
//         scatter = indices[index];
//         data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
    int indices[VT], OutputIt data, bool sync = true);
// For 0 <= i < VT:
//     shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
    bool sync = true);

// For 0 <= i < VT:
//     threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
    bool sync = true);
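The THREAD ORDER vs STRIDED ORDER distinction above is the central convention of this header, so a concrete picture helps. A small host-side printout (illustrative only) of which elements each of NT = 4 threads owns for VT = 2 under each convention:

```cpp
#include <cstdio>

int main() {
    const int NT = 4, VT = 2;
    for(int tid = 0; tid < NT; ++tid) {
        printf("tid %d  strided:", tid);
        for(int i = 0; i < VT; ++i) printf(" %d", NT * i + tid);  // NT*i + tid
        printf("   thread-order:");
        for(int i = 0; i < VT; ++i) printf(" %d", VT * tid + i);  // VT*tid + i
        printf("\n");
    }
    // tid 0 owns {0, 4} in strided order but {0, 1} in thread order, etc.
    return 0;
}
```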
// For 0 <= index < aCount:
//     shared[index] = a_global[index];
// For 0 <= index < bCount:
//     shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
//     If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
//     NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
    const T* b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
    const T* b_global, int bCount, int tid, T* shared, bool sync = true);

template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
    typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
    InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
    typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
    InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count)
//         gather = indices_shared[index];
//         dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
    const int* indices_shared, int tid, OutputIt dest_global,
    bool sync = true);
// For 0 <= i < VT:
//     index = NT * i + tid;
//     if(index < count)
//         gather = indices[index];
//         if(gather < aCount) data = a_global[gather];
//         else data = b_global[gather - aCount];
//         dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
    InputIt2 b_global, int bStart, const int* indices, int tid, T* reg,
    bool sync = false);

template<int NT, int VT, typename InputIt1, typename InputIt2,
    typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
    InputIt2 b_global, int bStart, const int* indices_shared, int tid,
    OutputIt dest_global, bool sync = true);

template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
    const T* b_global, int bStart, const int* indices, int tid, T* reg,
    bool sync = false);

template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
    const T* b_global, int bStart, const int* indices_shared, int tid,
    OutputIt dest_global, bool sync = true);

} // namespace mgpu

#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"