Commit 99e2985d authored by lishen

warpctc for dcu

parent 0bf5eb5f
.idea
*~
Makefile
build
# PyTorch bindings for Warp-ctc
[![Build Status](https://travis-ci.org/SeanNaren/warp-ctc.svg?branch=pytorch_bindings)](https://travis-ci.org/SeanNaren/warp-ctc)
This is an extension of the original repo found [here](https://github.com/baidu-research/warp-ctc).
## Installation
Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.
`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:
```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```
Now install the bindings:
```bash
cd pytorch_binding
python setup.py install
```
If you try the above on OSX with Anaconda3 (the Python distribution recommended by PyTorch) and get a dlopen error:
```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$(whoami)/anaconda3/lib
```
This resolves the "library not loaded" error; the destination path can be adapted to other Python installs if needed.
An example of using the bindings is shown below.
```python
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True) # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
```
## Documentation
```
CTCLoss(size_average=False, length_average=False)
# size_average (bool): normalize the loss by the batch size (default: False)
# length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
forward(acts, labels, act_lens, label_lens)
# acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
# labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
# act_lens: Tensor of size (batch) containing size of each output sequence from the network
# label_lens: Tensor of (batch) containing label length of each example
```
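For reference, here is a slightly larger, batched sketch that exercises the options documented above (the shapes, targets, and flag values are illustrative, not taken from the repository's tests):
```python
import torch
from warpctc_pytorch import CTCLoss

# Illustrative sizes: 5 time steps, batch of 2, alphabet of 6 symbols
# (index 0 is reserved for the CTC blank).
T, N, C = 5, 2, 6
acts = torch.randn(T, N, C).requires_grad_(True)  # raw network outputs (pre-softmax)
labels = torch.IntTensor([1, 2, 3, 4, 5])          # targets of both utterances, concatenated
label_lens = torch.IntTensor([3, 2])               # first target has 3 symbols, second has 2
act_lens = torch.IntTensor([T, T])                 # both utterances use all 5 frames

# length_average=True divides the summed loss by the total number of frames
# in the batch; with both flags False the loss is the plain sum over the batch.
ctc_loss = CTCLoss(size_average=False, length_average=True)
cost = ctc_loss(acts, labels, act_lens, label_lens)
cost.backward()
```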
# DLIB
## Environment setup
Before building with the DCU toolchain, prepare the build environment. See
[environment prepare](environment_prepare.md)
## Installing from source
### Preparing the build environment (using dtk-23.04 as an example)
- Clone the repository
```
git clone -b dtk-23.04 http://developer.hpccube.com/codes/aicomponent/warpctc.git
```
- Download DTK-23.04 from the DCU Toolkit section of the [developer community](https://developer.hpccube.com/tool/#sdk), extract it under /opt/, and create a symlink
```
cd /opt && ln -s dtk-23.04 dtk
```
- Load the environment variables and install the required dependency libraries
```shell
source /opt/dtk/env.sh
```
### Build and install
#### Building the Python API
- Install with python
```shell
python setup.py install
```
- Build a wheel (whl) package with python
```shell
python setup.py bdist_wheel
```
### Tests
- Verify the correctness of the warpctc loss (consistency between CPU and GPU); a sketch of such a check is shown after this list
```shell
python3 test_gpu.py
```
- Measure the GPU speedup of the warpctc loss
```shell
python3 test_gpu_speed.py
```
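The test scripts themselves are not part of this commit view. As a rough illustration only, a CPU/GPU consistency check in the spirit of `test_gpu.py` might look like the sketch below (the shapes, tolerance, and random targets are assumptions, not code from this repository):
```python
import torch
from warpctc_pytorch import CTCLoss

# Assumed sizes: 10 time steps, batch of 4, alphabet of 20 symbols (0 = blank).
T, N, C = 10, 4, 20
acts = torch.randn(T, N, C)
labels = torch.randint(1, C, (N * 5,), dtype=torch.int)  # 5 target symbols per utterance
label_lens = torch.IntTensor([5] * N)
act_lens = torch.IntTensor([T] * N)

ctc_loss = CTCLoss()
cpu_cost = ctc_loss(acts, labels, act_lens, label_lens)          # CPU path
gpu_cost = ctc_loss(acts.cuda(), labels, act_lens, label_lens)   # DCU/GPU path

# The two paths should agree to within floating-point tolerance.
assert torch.allclose(cpu_cost, gpu_cost.cpu(), atol=1e-4)
```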
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceLoadBalancingSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.
template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {
int bKey = b_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
else
p = aBegin < bKey;
if(p)
// Advance A (the needle).
a_shared[aBegin++] = bFirst + bBegin;
else
// Advance B (the haystack).
bKey = b_shared[++bBegin];
}
}
////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator<int>(first), b_global) - 1.
// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.
// CTALoadBalance requires NT * VT + 2 slots of shared memory.
template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
int sourceCount, int block, int tid, const int* mp_global,
int* indices_shared, bool loadPrecedingB) {
int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
mp_global);
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(!b0) loadPrecedingB = false;
// Load one trailing term from B. If we're already at the end, fill the
// end of the buffer with destCount.
int aCount = a1 - a0;
int bCount = b1 - b0;
int extended = b1 < sourceCount;
int loadCount = bCount + extended;
int fillCount = NT * VT + 1 - loadCount - aCount;
int* a_shared = indices_shared;
int* b_shared = indices_shared + aCount + (int)loadPrecedingB;
// Load the B values.
// DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
// b_global + b0 - (int)loadPrecedingB, tid,
// b_shared - (int)loadPrecedingB);
for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
b_shared[i] = b_global[b0 + i];
// Fill the end of the array with destCount.
for(int i = tid + extended; i < fillCount; i += NT)
b_shared[bCount + i] = destCount;
__syncthreads();
// Run a merge path to find the start of the serial merge for each thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
aCount, b_shared, bCount, diag, mgpu::less<int>());
int a0tid = a0 + mp;
int b0tid = diag - mp;
// Subtract 1 from b0 because we want to return upper_bound - 1.
DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
b0tid, bCount, a_shared - a0);
__syncthreads();
b0 -= (int)loadPrecedingB;
return make_int4(a0, a1, b0, b1);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialMerge
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
int bBegin, int bEnd, T* results, int* indices, Comp comp) {
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
else
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin - !RangeCheck;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and global
// merge pass levels) locate lists within the single source array.
// Returns (offset of a, offset of b, length of list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
// coop is the number of CTAs or threads cooperating to merge two lists into
// one. We round block down to the first CTA's ID that is working on this
// merge.
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
return make_int3(nv * start, nv * start + size, size);
}
// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
int nv, int count, int mp0, int mp1) {
// Locate diag from the start of the A sublist.
int diag = nv * block - frame.x;
int a0 = frame.x + mp0;
int a1 = min(count, frame.x + mp1);
int b0 = min(count, frame.y + diag - mp0);
int b1 = min(count, frame.y + diag + nv - mp1);
// The end partition of the last block for each merge operation is computed
// and stored as the begin partition for the subsequent merge. i.e. it is
// the same partition but in the wrong coordinate system, so its 0 when it
// should be listSize. Correct that by checking if this is the last block
// in this merge operation.
if(coop - 1 == ((coop - 1) & block)) {
a1 = min(count, frame.x + frame.z);
b1 = min(count, frame.y + frame.z);
}
return make_int4(a0, a1, b0, b1);
}
////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange
MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
int coop, int NV, const int* mp_global) {
// Load the merge paths computed by the partitioning kernel.
int mp0 = mp_global[block];
int mp1 = mp_global[block + 1];
int gid = NV * block;
// Compute the ranges of the sources in global memory.
int4 range;
if(coop) {
int3 frame = FindMergesortFrame(coop, block, NV);
range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0,
mp1);
} else {
range.x = mp0; // a0
range.y = mp1; // a1
range.z = gid - range.x; // b0
range.w = min(aCount + bCount, gid + NV) - range.y; // b1
}
return range;
}
////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
int coop, T* keys, int* indices, Comp comp) {
int list = ~(coop - 1) & tid;
int diag = min(count, VT * ((coop - 1) & tid));
int start = VT * list;
int a0 = min(count, start);
int b0 = min(count, start + VT * (coop / 2));
int b1 = min(count, start + VT * coop);
int p = MergePath<MgpuBoundsLower>(keys_shared + a0, b0 - a0,
keys_shared + b0, b1 - b0, diag, comp);
SerialMerge<VT, true>(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
indices, comp);
}
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int tid, int count,
Comp comp) {
#pragma unroll
for(int coop = 2; coop <= NT; coop *= 2) {
int indices[VT];
KeyType keys[VT];
CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
indices, comp);
if(HasValues) {
// Exchange the values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(keys, tid, keys_shared);
}
}
////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This functions sorts the first
// count elements.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int count, int tid,
Comp comp) {
// Stable sort the keys in the thread.
if(VT * tid < count) {
if(Stable)
OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
else
OddEvenMergesort<VT>(threadKeys, threadValues, comp);
}
// Store the locally sorted keys into shared memory.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
// Recursively merge lists until the entire CTA is sorted.
CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
values_shared, tid, count, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
KeyType* keys_shared, int count, int tid, Comp comp) {
int valuesTemp[VT];
CTAMergesort<NT, VT, Stable, false>(threadKeys, valuesTemp, keys_shared,
(int*)keys_shared, count, tid, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int count, int tid, Comp comp) {
CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
values_shared, count, tid, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices
template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
typename T, typename Comp>
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
Comp comp) {
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(LoadExtended) {
bool extended = (a1 < aCount) && (b1 < bCount);
aCount = a1 - a0;
bCount = b1 - b0;
int aCount2 = aCount + (int)extended;
int bCount2 = bCount + (int)extended;
// Load one element past the end of each input to avoid having to use
// range checking in the merge loop.
DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + a0, aCount2,
b_global + b0, bCount2, tid, keys_shared);
// Run a Merge Path search for each thread's starting point.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount2, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int b0tid = aCount2 + diag - mp;
if(extended) {
SerialMerge<VT, false>(keys_shared, a0tid, 0, b0tid, 0, results,
indices, comp);
} else {
int a1tid = aCount;
int b1tid = aCount2 + bCount;
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
results, indices, comp);
}
} else {
// Use the input intervals from the ranges between the merge path
// intersections.
aCount = a1 - a0;
bCount = b1 - b0;
// Load the data into shared memory.
DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
bCount, tid, keys_shared);
// Run a merge path to find the start of the serial merge for each
// thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register.
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid, results,
indices, comp);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.
template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
typename KeyType, typename ValsIt3, typename Comp>
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {
KeyType results[VT];
int indices[VT];
DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
bKeys_global, bCount, range, tid, keys_shared, results, indices, comp);
// Store merge results back to shared memory.
DeviceThreadToShared<VT>(results, tid, keys_shared);
// Store merged keys to global memory.
aCount = range.y - range.x;
bCount = range.w - range.z;
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keys_global + NT * VT * block);
// Copy the values.
if(HasValues) {
DeviceThreadToShared<VT>(indices, tid, indices_shared);
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
aVals_global + range.x, bVals_global + range.z, aCount,
indices_shared, tid, vals_global + NT * VT * block);
}
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// CTAReduce
template<int NT, typename Op = mgpu::plus<int> >
struct CTAReduce {
typedef typename Op::first_argument_type T;
enum { Size = NT, Capacity = NT };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
storage.shared[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
if(tid < destCount) {
// Read from the right half and store to the left half.
x = op(x, storage.shared[destCount + tid]);
storage.shared[tid] = x;
}
__syncthreads();
}
T total = storage.shared[0];
__syncthreads();
return total;
}
};
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
typedef int T;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
// In the first phase, threads cooperatively find the reduction within
// their segment. The segments are SecSize threads (NT / WARP_SIZE)
// wide.
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_add(x, offset, SecSize);
// The last thread in each segment stores the local reduction to shared
// memory.
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
// Reduce the totals of each input segment. The spine is WARP_SIZE
// threads wide.
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_add(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
typedef mgpu::maximum<int> Op;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_max(x, offset, SecSize);
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_max(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTAScan
template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
typedef typename Op::result_type T;
enum { Size = NT, Capacity = 2 * NT + 1 };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {
storage.shared[tid] = x;
int first = 0;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tid >= offset)
x = op(storage.shared[first + tid - offset], x);
first = NT - first;
storage.shared[first + tid] = x;
__syncthreads();
}
*total = storage.shared[first + NT - 1];
if(MgpuScanTypeExc == type)
x = tid ? storage.shared[first + tid - 1] : identity;
__syncthreads();
return x;
}
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
T total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
}
};
////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan<NT, ScanOpAdd> on Kepler.
// This uses the shfl intrinsic to reduce scan latency.
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
enum { Capacity = NumSegments + 1 };
struct Storage { int shared[Capacity + 1]; };
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {
// Define WARP_SIZE segments that are NT / WARP_SIZE large.
// Each warp makes log(SegSize) shfl_add calls.
// The spine makes log(WARP_SIZE) shfl_add calls.
int lane = (SegSize - 1) & tid;
int segment = tid / SegSize;
// Scan each segment using shfl_add.
int scan = x;
#pragma unroll
for(int offset = 1; offset < SegSize; offset *= 2)
scan = shfl_add(scan, offset, SegSize);
// Store the reduction (last element) of each segment into storage.
if(SegSize - 1 == lane) storage.shared[segment] = scan;
__syncthreads();
// Warp 0 does a full shfl warp scan on the partials. The total is
// stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
if(tid < NumSegments) {
int y = storage.shared[tid];
int scan = y;
#pragma unroll
for(int offset = 1; offset < NumSegments; offset *= 2)
scan = shfl_add(scan, offset, NumSegments);
storage.shared[tid] = scan - y;
if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
}
__syncthreads();
// Add the scanned partials back in and convert to exclusive scan.
scan += storage.shared[segment];
if(MgpuScanTypeExc == type) {
scan -= x;
if(identity && !tid) scan = identity;
}
*total = storage.shared[NumSegments];
__syncthreads();
return scan;
}
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
int total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
const int NumWarps = NT / WARP_SIZE;
int warp = tid / WARP_SIZE;
int lane = (WARP_SIZE - 1) & tid;
// Store the bit totals for each warp.
uint bits = __ballot(x);
shared[warp] = popc(bits);
__syncthreads();
#if __CUDA_ARCH__ >= 300
if(tid < NumWarps) {
int x = shared[tid];
int scan = x;
#pragma unroll
for(int offset = 1; offset < NumWarps; offset *= 2)
scan = shfl_add(scan, offset, NumWarps);
shared[tid] = scan - x;
}
__syncthreads();
#else
// Thread 0 scans warp totals.
if(!tid) {
int scan = 0;
#pragma unroll
for(int i = 0; i < NumWarps; ++i) {
int y = shared[i];
shared[i] = scan;
scan += y;
}
shared[NumWarps] = scan;
}
__syncthreads();
#endif // __CUDA_ARCH__ >= 300
// Add the warp scan back into the partials.
int scan = shared[warp] + __popc(bfe(bits, 0, lane));
*total = shared[NumWarps];
__syncthreads();
return scan;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
#include "../mgpudevice.cuh"
namespace mgpu {
template<MgpuBounds Bounds, typename IntT, typename It, typename T,
typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
int shift, Comp comp) {
IntT scale = (1<< shift) - 1;
int mid = (int)((begin + scale * end)>> shift);
T key2 = data[mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
!comp(key, key2) :
comp(key2, key);
if(pred) begin = mid + 1;
else end = mid;
}
template<MgpuBounds Bounds, typename IntT, typename T, typename It,
typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
Comp comp) {
int begin = 0;
int end = count;
if(levels >= 4 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 9, comp);
if(levels >= 3 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 7, comp);
if(levels >= 2 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 5, comp);
if(levels >= 1 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 4, comp);
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
int begin = 0;
int end = count;
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// MergePath search
template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
Comp comp) {
typedef typename std::iterator_traits<It1>::value_type T;
int begin = max(0, diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
T aKey = a[mid];
T bKey = b[diag - 1 - mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search
template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {
// leftEnd and rightStart are defined from the origin, and diag is defined
// from aOffset.
// We only need to run a Merge Path search if the diagonal intersects the
// segment that strides the left and right halves (i.e. is between leftEnd
// and rightStart).
if(aOffset + diag <= leftEnd) return diag;
if(aOffset + diag >= rightStart) return aCount;
bCount = min(bCount, rightStart - bOffset);
int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
int ai = aOffset + mid;
int bi = bOffset + diag - 1 - mid;
bool pred = !comp(keys[bi], keys[ai]);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// BalancedPath search
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
int bCount, int diag, int levels, Comp comp) {
typedef typename std::iterator_traits<InputIt1>::value_type T;
int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
int aIndex = p;
int bIndex = diag - p;
bool star = false;
if(bIndex < bCount) {
if(Duplicates) {
T x = b[bIndex];
// Search for the beginning of the duplicate run in both A and B.
int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
levels, comp);
int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
levels, comp);
// The distance between the merge path and the lower_bound is the
// 'run'. We add up the a- and b- runs and evenly distribute them to
// get a stairstep path.
int aRun = aIndex - aStart;
int bRun = bIndex - bStart;
int xCount = aRun + bRun;
// Attempt to advance b and regress a.
int bAdvance = max(xCount>> 1, bRun);
int bEnd = min(bCount, bStart + bAdvance + 1);
int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
bEnd - bIndex, x, comp) + bIndex;
bRun = bRunEnd - bStart;
bAdvance = min(bAdvance, bRun);
int aAdvance = xCount - bAdvance;
bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
aIndex = aStart + aAdvance;
if(roundUp) star = true;
} else {
if(aIndex && aCount) {
T aKey = a[aIndex - 1];
T bKey = b[bIndex];
// If the last consumed element in A (aIndex - 1) is the same as
// the next element in B (bIndex), we're sitting at a starred
// partition.
if(!comp(aKey, bKey)) star = true;
}
}
}
return make_int2(aIndex, star);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasegscan.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.
// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.
struct SegReduceRange {
int begin;
int end;
int total;
bool flushLast;
};
MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
SegReduceRange range;
range.begin = 0x7fffffff & limit0;
range.end = 0x7fffffff & limit1;
range.total = range.end - range.begin;
range.flushLast = 0 == (0x80000000 & limit1);
range.end += !range.flushLast;
return range;
}
// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval Spmv.
template<int VT>
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
int rows[VT + 1]) {
rows[0] = first;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & endFlags) ++first;
rows[i + 1] = first;
}
}
////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.
template<int NT, int VT>
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {
// Each thread binary searches for its starting row.
int row = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
mgpu::less<int>()) - 1;
// Each thread starts at row and scans forward, emitting row IDs into
// register. Store the CTA-local row index (starts at 0) to rows and the
// start of the row (globally) to rowStarts.
int curOffset = csr_shared[row];
int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
rows[0] = row;
rowStarts[0] = curOffset;
int endFlags = 0;
#pragma unroll
for(int i = 1; i <= VT; ++i) {
// Advance the row cursor when the iterator hits the next row offset.
if(tidOffset + i == nextOffset) {
// Set an end flag when the cursor advances to the next row.
endFlags |= 1<< (i - 1);
// Advance the cursor and load the next row offset.
++row;
curOffset = nextOffset;
nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
}
rows[i] = row;
if(i < VT) rowStarts[i] = curOffset;
}
__syncthreads();
return endFlags;
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute end-flags
// by comparing adjacent row IDs.
// DeviceSegReducePrepare may be called either by a pre-processing kernel or by
// the kernel that actually evaluates the segmented reduction if no preprocessing
// is desired.
struct SegReduceTerms {
int endFlags;
int tidDelta;
};
template<int NT, int VT>
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {
// Pass a sentinel (end) to point to the next segment start. If we flush,
// this is the end of this tile. Otherwise it is INT_MAX
int endFlags = DeviceExpandCsrRows<NT, VT>(gid + VT * tid, csr_shared,
numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);
// Find the distance to scan to compute carry-in for each thread. Use the
// existence of an end flag anywhere in the thread to determine if carry-out
// values from the left should propagate through to the right.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, rows[0] != rows[VT],
csr_shared);
SegReduceTerms terms = { endFlags, tidDelta };
return terms;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
// segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.
template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
struct CTASegReduce {
typedef CTASegScan<NT, Op> SegScan;
enum {
NV = NT * VT,
Capacity = HalfCapacity ? (NV / 2) : NV
};
union Storage {
typename SegScan::Storage segScanStorage;
T values[Capacity];
};
template<typename DestIt>
MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
int tidDelta, int startRow, int block, int tid, T data[VT],
DestIt dest_global, T* carryOut_global, T identity, Op op,
Storage& storage) {
// Run a segmented scan within the thread.
T x, localScan[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
x = i ? op(x, data[i]) : data[i];
localScan[i] = x;
if(rows[i] != rows[i + 1]) x = identity;
}
// Run a parallel segmented scan over the carry-out values to compute
// carry-in.
T carryOut;
T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
storage.segScanStorage, &carryOut, identity, op);
// Store the carry-out for the entire CTA to global memory.
if(!tid) carryOut_global[block] = carryOut;
dest_global += startRow;
if(HalfCapacity && total > Capacity) {
// Add carry-in to each thread-local scan value. Store directly
// to global.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store on the end flag and clear the carry-in.
if(rows[i] != rows[i + 1]) {
carryIn = identity;
dest_global[rows[i]] = x2;
}
}
} else {
// All partials fit in shared memory. Add carry-in to each thread-
// local scan value.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store reduction when the segment changes and clear the
// carry-in.
if(rows[i] != rows[i + 1]) {
storage.values[rows[i]] = x2;
carryIn = identity;
}
}
__syncthreads();
// Cooperatively store reductions to global memory.
for(int index = tid; index < total; index += NT)
dest_global[index] = storage.values[index];
__syncthreads();
}
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.
template<int NT>
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
const int NumWarps = NT / 32;
int warp = tid / 32;
int lane = 31 & tid;
uint warpMask = 0xffffffff>> (31 - lane); // inclusive search
uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search
uint warpBits = __ballot(flag);
delta_shared[warp] = warpBits;
__syncthreads();
if(tid < NumWarps) {
uint ctaBits = __ballot(0 != delta_shared[tid]);
int warpSegment = 31 - clz(ctaMask & ctaBits);
int start = (-1 != warpSegment) ?
(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
delta_shared[NumWarps + tid] = start;
}
__syncthreads();
// Find the closest flag to the left of this thread within the warp.
// Include the flag for this thread.
int start = 31 - clz(warpMask & warpBits);
if(-1 != start) start += ~31 & tid;
else start = delta_shared[NumWarps + warp];
__syncthreads();
return tid - start;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegScan
template<int NT, typename _Op = mgpu::plus<int> >
struct CTASegScan {
typedef _Op Op;
typedef typename Op::result_type T;
enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
union Storage {
int delta[NumWarps];
T values[Capacity];
};
// Each thread passes the reduction of the LAST SEGMENT that it covers.
// flag is set to true if there's at least one segment flag in the thread.
// SegScan returns the reduction of values for the first segment in this
// thread over the preceding threads.
// Return the value init for the first thread.
// When scanning single elements per thread, interpret the flag as a BEGIN
// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
// thread tid.
// The function returns the reduction of the last segment in the CTA.
MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {
// Run an inclusive scan
int first = 0;
storage.values[first + tid] = x;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tidDelta >= offset)
x = op(storage.values[first + tid - offset], x);
first = NT - first;
storage.values[first + tid] = x;
__syncthreads();
}
// Get the exclusive scan.
x = tid ? storage.values[first + tid - 1] : identity;
*carryOut = storage.values[first + NT - 1];
__syncthreads();
return x;
}
MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
T* carryOut, T identity = (T)0, Op op = Op()) {
// Find the left-most thread that covers the first segment of this
// thread.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);
return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
template<int VT, typename T, typename Comp>
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
int leftEnd, int rightStart, Comp comp, bool sync = true) {
bEnd = min(rightStart, bEnd);
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
// If A has run out of inputs, emit B.
if(aBegin >= aEnd)
p = false;
else if(bBegin >= bEnd || aBegin < leftEnd)
// B has hit the end of the middle segment.
// Emit A if A has inputs remaining in the middle segment.
p = true;
else
// Emit the smaller element in the middle segment.
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
if(sync) { __syncthreads(); }
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {
// Locate the intervals of the input lists.
int3 frame = FindMergesortFrame(2<< pass, tid, VT);
int a0 = frame.x;
int b0 = frame.y;
int listLen = frame.z;
int list = tid>> pass;
int listParity = 1 & list;
int diag = VT * tid - frame.x;
// Fetch the active range for the list this thread's list is merging with.
int siblingRange = ranges_shared[1 ^ list];
int siblingStart = 0x0000ffff & siblingRange;
int siblingEnd = siblingRange>> 16;
// Create a new active range for the merge.
int leftEnd = listParity ? siblingEnd : activeRange.y;
int rightStart = listParity ? activeRange.x : siblingStart;
activeRange.x = min(activeRange.x, siblingStart);
activeRange.y = max(activeRange.y, siblingEnd);
int p = SegmentedMergePath(keys_shared, a0, listLen, b0, listLen, leftEnd,
rightStart, diag, comp);
int a0tid = a0 + p;
int b0tid = b0 + diag - p;
SegmentedSerialMerge<VT>(keys_shared, a0tid, b0, b0tid, b0 + listLen,
results, indices, leftEnd, rightStart, comp);
// Store the ranges to shared memory.
if(0 == diag)
ranges_shared[list>> 1] =
(int)bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, int tid, int2 activeRange, Comp comp) {
const int NumPasses = sLogPow2<NT>::value;
#pragma unroll
for(int pass = 0; pass < NumPasses; ++pass) {
int indices[VT];
CTASegsortPass<NT, VT>(keys_shared, ranges_shared, tid, pass,
threadKeys, indices, activeRange, comp);
if(HasValues) {
// Exchange values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
}
return activeRange;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, Comp comp) {
if(Stable)
// Odd-even transpose sort.
OddEvenTransposeSortFlags<VT>(threadKeys, threadValues, headFlags,
comp);
else
// Batcher's odd-even mergesort.
OddEvenMergesortFlags<VT>(threadKeys, threadValues, headFlags, comp);
// Record the first and last occurrence of head flags in this segment.
int blockEnd = 31 - clz(headFlags);
if(-1 != blockEnd) blockEnd += VT * tid;
int blockStart = ffs(headFlags);
blockStart = blockStart ? (VT * tid - 1 + blockStart) : (NT * VT);
ranges_shared[tid] = (int)bfi(blockEnd, blockStart, 16, 16);
// Store back to shared mem. The values are in VT-length sorted lists.
// These are merged recursively.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
int2 activeRange = CTASegsortLoop<NT, VT, HasValues>(threadKeys,
threadValues, keys_shared, values_shared, ranges_shared, tid,
make_int2(blockStart, blockEnd), comp);
return activeRange;
}
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid, int headFlags,
KeyType* keys_shared, int* ranges_shared, Comp comp) {
int valuesTemp[VT];
return CTASegsort<NT, VT, Stable, false>(threadKeys, valuesTemp, tid,
headFlags, keys_shared, (int*)keys_shared, ranges_shared, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, Comp comp) {
return CTASegsort<NT, VT, Stable, true>(threadKeys, threadValues, tid,
headFlags, keys_shared, values_shared, ranges_shared, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.
template<int NT, int VT, bool Stable, bool HasValues, typename InputIt1,
typename InputIt2, typename KeyType, typename ValType, typename OutputIt1,
typename OutputIt2, typename Comp>
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
InputIt2 values_global, int count2, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, int headFlags, int tid,
int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
int* ranges_global, Comp comp) {
// Load keys into register in thread order.
int gid = NT * VT * block;
KeyType threadKeys[VT];
DeviceGlobalToShared<NT, VT>(count2, keys_global + gid, tid, keys_shared);
DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);
// Load the values from global memory and into register in thread order.
ValType threadValues[VT];
if(HasValues) {
DeviceGlobalToShared<NT, VT>(count2, values_global + gid, tid,
values_shared);
DeviceSharedToThread<VT>(values_shared, tid, threadValues);
}
// Run the CTA segmented blocksort.
int2 activeRange = CTASegsort<NT, VT, Stable, HasValues>(threadKeys,
threadValues, tid, headFlags, keys_shared, values_shared, ranges_shared,
comp);
// Store the keys to global memory.
DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
keysDest_global + gid);
if(HasValues) {
// Store the values to global memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceSharedToGlobal<NT, VT>(count2, values_shared, tid,
valsDest_global + gid, false);
}
// Store the 16-bit packed ranges. These are used by all merge kernels and
// the first level of global segmented merge path partitioning.
if(!tid)
ranges_global[block] = bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.
template<int NT, int VT>
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
const int* partitions_global, int tid, int block, int count2,
int* words_shared, byte* flags_shared) {
const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
int gid = NT * VT * block;
int p0 = partitions_global[block];
int p1 = partitions_global[block + 1];
int headFlags = 0;
if(p1 > p0 || count2 < NT * VT) {
// Clear the flag bytes, then loop through the indices and poke in flag
// values.
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i)
words_shared[NT * i + tid] = 0;
__syncthreads();
for(int index = p0 + tid; index < p1; index += NT) {
int headFlag = indices_global[index];
flags_shared[headFlag - gid] = 1;
}
__syncthreads();
// Combine all the head flags for this thread.
int first = VT * tid;
int offset = first / 4;
int prev = words_shared[offset];
int mask = 0x3210 + 0x1111 * (3 & first);
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i) {
// Gather the next four flags.
int next = words_shared[offset + 1 + i];
int x = prmt(prev, next, mask);
prev = next;
// Set the head flag bits.
if(0x00000001 & x) headFlags |= 1<< (4 * i);
if(0x00000100 & x) headFlags |= 1<< (4 * i + 1);
if(0x00010000 & x) headFlags |= 1<< (4 * i + 2);
if(0x01000000 & x) headFlags |= 1<< (4 * i + 3);
}
__syncthreads();
// Set head flags for out-of-range keys.
int outOfRange = min(VT, first + VT - count2);
if(outOfRange > 0)
headFlags = bfi(0xffffffff, headFlags, VT - outOfRange, outOfRange);
// Clear head flags above VT.
headFlags &= (1<< VT) - 1;
}
return headFlags;
}
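// Illustrative note on the prmt gather above (assumed values): the mask
// 0x3210 + 0x1111 * (3 & first) selects four consecutive flag bytes starting
// at byte offset `first`, even when `first` is not word aligned. E.g. with
// VT = 11 and tid = 1, first = 11 and 3 & first = 3, so mask = 0x6543:
// prmt(prev, next, 0x6543) returns byte 3 of prev followed by bytes 0..2 of
// next, i.e. flag bytes 11, 12, 13, 14 packed into one word.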
////////////////////////////////////////////////////////////////////////////////
// SegSortSupport
struct SegSortSupport {
int* ranges_global;
int2* ranges2_global;
int4* mergeList_global;
int* copyList_global;
int2* queueCounters_global;
int2* nextCounters_global;
byte* copyStatus_global;
};
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType,
typename Comp>
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
const ValueType* values_global, int2 segmentRange, int tid,
int block, int4 range, int pass, KeyType* keys_shared,
int* indices_shared, KeyType* keysDest_global, ValueType* valsDest_global,
Comp comp) {
const int NV = NT * VT;
int gid = NV * block;
// Load the local compressed segment indices.
int a0 = range.x;
int aCount = range.y - range.x;
int b0 = range.z;
int bCount = range.w - range.z;
DeviceLoad2ToShared<NT, VT, VT>(keys_global + a0, aCount, keys_global + b0,
bCount, tid, keys_shared);
////////////////////////////////////////////////////////////////////////////
// Run a merge path to find the starting point for each thread to merge.
// If the entire warp fits into the already-sorted segments, we can skip
// sorting it and leave its keys in shared memory. Doing this on the warp
// level rather than thread level (also legal) gives slightly better
// performance.
int segStart = segmentRange.x;
int segEnd = segmentRange.y;
int listParity = 1 & (block>> pass);
int warpOffset = VT * (~31 & tid);
bool sortWarp = listParity ?
// The spliced segment is to the left (segStart).
(warpOffset < segStart) :
// The spliced segment is to the right (segEnd).
(warpOffset + 32 * VT > segEnd);
KeyType threadKeys[VT];
int indices[VT];
if(sortWarp) {
int diag = VT * tid;
int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register. All threads in the CTA share the same list
// parity, so we hoist that check outside the function call to simplify the
// logic. Unlike in the blocksort, this does not cause warp divergence.
SegmentedSerialMerge<VT>(keys_shared, a0tid, a1tid, b0tid, b1tid,
threadKeys, indices, listParity ? 0 : segEnd,
listParity ? segStart : NV, comp, false);
}
__syncthreads();
// Store sorted data in register back to shared memory. Then copy to global.
if(sortWarp)
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared, false);
__syncthreads();
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keysDest_global + gid);
////////////////////////////////////////////////////////////////////////////
// Use the merge indices to gather values from global memory. Store directly
// to valsDest_global.
if(HasValues) {
// Transpose the gather indices to help coalesce loads.
if(sortWarp)
DeviceThreadToShared<VT>(indices, tid, indices_shared, false);
else {
#pragma unroll
for(int i = 0; i < VT; ++i)
indices_shared[VT * tid + i] = VT * tid + i;
}
__syncthreads();
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
values_global + a0, values_global + b0, aCount, indices_shared,
tid, valsDest_global + NV * block);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType>
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
const ValueType* values_global, int tid, int block, int count,
KeyType* keysDest_global, ValueType* valsDest_global) {
int gid = NT * VT * block;
int count2 = min(NT * VT, count - gid);
DeviceGlobalToGlobal<NT, VT>(count2, keys_global + gid, tid,
keysDest_global + gid);
if(HasValues)
DeviceGlobalToGlobal<NT, VT>(count2, values_global + gid, tid,
valsDest_global + gid);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch
template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
Comp comp) {
const int FlagA = IndexA ? 0x80000000 : 1;
const int FlagB = IndexB ? 0x80000000 : 1;
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
T aPrev, bPrev;
if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
int decisions = 0;
int matchCountA = 0;
int matchCountB = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck && aBegin >= aEnd) p = false;
else if(RangeCheck && bBegin >= bEnd) p = true;
else p = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(p) {
// aKey is smaller than bKey, so it is inserted before bKey.
// Save bKey's index (bBegin + first) as the result of the search
// and advance to the next needle in A.
bool match = false;
if(MatchA) {
// Test if there is an element in B that matches aKey.
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: We're inserting aKey after bKey. If there
// is a match for aKey it must be bPrev. Check that bPrev
// is in range and equal to aKey.
// The predicate test result !comp(aKey, bPrev) was
// established on the previous A-advancing iteration (it
// failed the comp(aKey, bKey) test to get us to this
// point). Check the other half of the equality condition
// with a second comparison.
bool inRange = !RangeCheck || (bBegin > aEnd);
match = inRange && !comp(bPrev, aKey);
} else {
// Lower Bound: We're inserting aKey before bKey. If there
// is a match for aKey, it must be bKey. Check that bKey
// is in range and equal to aKey.
// The predicate test !comp(bKey, aKey) has established one
// half of the equality condition. We establish the other
// half with a second comparison.
bool inRange = !RangeCheck || (bBegin < bEnd);
match = inRange && !comp(aKey, bKey);
}
}
int index = 0;
if(IndexA) index = bOffset + bBegin;
if(match) index |= FlagA;
if(IndexA || MatchA) indices[i] = index;
matchCountA += match;
// Mark the decision bit to indicate that this iteration has
// progressed A (the needles).
decisions |= 1<< i;
aPrev = aKey;
aKey = keys_shared[++aBegin];
} else {
// aKey is larger than bKey, so it is inserted after bKey (but we
// don't know where yet). Advance the B index to the next element in
// the haystack to continue the search for the current needle.
bool match = false;
if(MatchB) {
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: aKey is not smaller than bKey. We advance to
// the next haystack element in B. If there is a match in A
// for bKey it must be aKey. By entering this branch we've
// verified that !comp(aKey, bKey). Making the reciprocal
// comparison !comp(bKey, aKey) establishes aKey == bKey.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin < aEnd));
match = inRange && !comp(bKey, aKey);
} else {
// Lower Bound: bKey is smaller than aKey. We advance to the
// next element in B. If there is a match for bKey, it must
// be aPrev. The previous A-advancing iteration proved that
// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
// other half of the equality condition.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin > 0));
match = inRange && !comp(aPrev, bKey);
}
}
int index = 0;
if(IndexB) index = aOffset + aBegin;
if(match) index |= FlagB;
if(IndexB || MatchB) indices[i] = index;
matchCountB += match;
// Keep the decision bit cleared to indicate that this iteration
// has progressed B (the haystack).
bPrev = bKey;
bKey = keys_shared[++bBegin];
}
}
return make_int3(decisions, matchCountA, matchCountB);
}
////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.
template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
int tid, int* indices_shared, Comp comp) {
// Run a merge path to find the start of the serial search for each thread.
int diag = VT * tid;
int mp = MergePath<Bounds>(keys_shared + aStart, aCount,
keys_shared + bStart, bCount, diag, comp);
int a0tid = mp;
int b0tid = diag - mp;
// Serial search into register.
int3 results;
int indices[VT];
if(extended)
results = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
else
results = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
__syncthreads();
// Compact the indices into shared memory. Use the decision bits (set is A,
// cleared is B) to select the destination.
int decisions = results.x;
b0tid += aCount;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & decisions) {
if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
} else {
if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
}
}
__syncthreads();
// Return the match counts for A and B keys.
return make_int2(results.y, results.z);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include <climits>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
template<typename T>
struct less : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
template<typename T>
struct plus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
////////////////////////////////////////////////////////////////////////////////
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
T c = a;
a = b;
b = c;
}
template<typename T>
struct DevicePair {
T x, y;
};
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
DevicePair<T> p = { x, y };
return p;
}
template<typename T> struct numeric_limits;
template<> struct numeric_limits<int> {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
a = a + b;
return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
a = a * b;
return a;
}
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::max(a, b);
#else
return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::min(a, b);
#else
return (b < a) ? b : a;
#endif
}
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
return _value;
}
MGPU_HOST_DEVICE T operator*() const {
return _value;
}
MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
return *this;
}
MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
return *this;
}
private:
T _value;
};
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _value + i;
}
MGPU_HOST_DEVICE T operator*() {
return _value;
}
MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
return counting_iterator(_value + diff);
}
MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
return counting_iterator(_value - diff);
}
MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
_value += diff;
return *this;
}
MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
_value -= diff;
return *this;
}
private:
T _value;
};
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
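// Usage sketch (illustrative, not part of the original source):
//   mgpu::step_iterator<int> it(100, 10);
//   it[0] == 100, it[3] == 130, *(it + 5) == 150
// counting_iterator and constant_iterator follow the same pattern with a unit
// step and a fixed value, respectively.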
} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + (-diff);
}
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Get the difference between two pointers in bytes.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
return (const byte*)b - (const byte*)a;
}
// Offset a pointer by i bytes.
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
div_t d = div(numItems, numWorkers);
return make_int2(d.quot, d.rem);
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
int2 range;
range.x = task.x * block;
range.x += min(block, task.y);
range.y = range.x + task.x + (block < task.y);
return range;
}
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
int count) {
int2 range = ComputeTaskRange(block, task);
range.x *= blockSize;
range.y = min(count, range.y * blockSize);
return range;
}
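// Worked example (illustrative values): DivideTaskRange(10, 4) returns
// task = (quot, rem) = (2, 2). ComputeTaskRange then hands the first `rem`
// workers one extra item each:
//   block 0 -> [0, 3)   block 1 -> [3, 6)
//   block 2 -> [6, 8)   block 3 -> [8, 10)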
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// DeviceExtractHeadFlags returns numBits flags starting at bit index.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
int numBits) {
int index2 = index>> 5;
int shift = 31 & index;
uint headFlags = flags[index2]>> shift;
int shifted = 32 - shift;
if(shifted < numBits)
// We also need to shift in the next set of bits.
headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
headFlags &= (1<< numBits) - 1;
return headFlags;
}
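// Worked example (illustrative values): index = 62, numBits = 4.
//   index2 = 62>> 5 = 1, shift = 31 & 62 = 30
//   headFlags = flags[1]>> 30 supplies head flags 62 and 63; shifted = 2 is
//   less than numBits, so bfi splices the low bits of flags[2] (flag 64
//   onward) above them, and the final mask keeps the 4 requested flags 62..65.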
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack VT bits per thread at 32 bits/thread. Will consume an integer number of
// words, because CTA size is a multiple of 32. The first NT * VT / 32 threads
// return packed words.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
const int Items = MGPU_DIV_UP(32, VT);
int index = 32 * tid;
int first = index / VT;
int bit = 0;
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
__syncthreads();
return packed;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync..b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
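// Worked example of the emulated path (illustrative values):
//   bfi(0xABCD, 0x12345678, 8, 8)
//   mask   = 0xFF<< 8 = 0x0000FF00
//   result = (0x12345678 & ~mask) | (mask & (0xABCD<< 8)) = 0x1234CD78
// i.e. 8 bits of x are inserted into y starting at bit 8.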
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
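// Worked example (illustrative values): each selector nibble of `index` picks
// one byte, 0-3 from a and 4-7 from b, with bit 3 of the nibble requesting
// sign replication of the selected byte:
//   prmt(0x33221100, 0x77665544, 0x5140) = 0x55114400
//   (nibbles 0, 4, 1, 5 pick bytes a0, b0, a1, b1)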
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
//
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
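// Worked example (illustrative values): FastDivide(7)
//   p = 31 + FindLog2(7, true) = 31 + 3 = 34
//   coef = ((1ull<< 34) + 6) / 7 = 0x92492493, shift = 34 - 32 = 2
//   Divide(20) = umulhi(20, 0x92492493)>> 2 = 11>> 2 = 2, matching 20 / 7.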
#pragma GCC diagnostic pop
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = comp(aKey, bKey);
bool pB = comp(bKey, aKey);
// The outputs must come from A by definition of set intersection.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA == pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
// Both are in range.
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// Output A in case of a tie, so check if b < a.
results[i] = pB ? bKey : aKey;
indices[i] = pB ? bBegin : aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && aBegin >= aEnd)
pB = true;
else if(RangeCheck && bBegin >= bEnd)
pA = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
// The outputs must come from A by definition of set difference.
results[i] = aKey;
indices[i] = aBegin;
if(!pB) ++aBegin;
if(!pA) ++bBegin;
if(pA) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
const int MinIterations = VT / 2;
int commit = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool test = RangeCheck ?
(aBegin + bBegin < end) :
(i < MinIterations || (aBegin + bBegin < end));
if(test) {
T aKey = data[aBegin];
T bKey = data[bBegin];
bool pA = false, pB = false;
if(RangeCheck && (bBegin >= bEnd))
pA = true;
else if(RangeCheck && (aBegin >= aEnd))
pB = true;
else {
pA = comp(aKey, bKey);
pB = comp(bKey, aKey);
}
results[i] = pA ? aKey : bKey;
indices[i] = pA ? aBegin : bBegin;
if(!pA) ++bBegin;
if(!pB) ++aBegin;
if(pA != pB) commit |= 1<< i;
}
}
return commit;
}
////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
int end = aBegin + bBegin + VT - star;
if(RangeCheck) end = min(end, aEnd + bEnd);
int commit;
switch(Op) {
case MgpuSetOpIntersection:
commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
aEnd, bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpUnion:
commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpDiff:
commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
case MgpuSetOpSymDiff:
commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
bBegin, bEnd, end, results, indices, comp);
break;
}
__syncthreads();
return commit;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
// CUDA Compiler does not currently unroll these loops correctly. Write using
// template loop unrolling.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
#pragma unroll
for(int level = 0; level < VT; ++level) {
#pragma unroll
for(int i = 1 & level; i < VT - 1; i += 2) {
if(comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
}
}
}*/
template<int I, int VT>
struct OddEvenTransposeSortT {
// Sort segments marked by head flags. If the head flag between i and i + 1
// is set (so that (2<< i) & flags is true), the values belong to different
// segments and are not swapped.
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
#pragma unroll
for(int i = 1 & I; i < VT - 1; i += 2)
if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
mgpu::swap(keys[i], keys[i + 1]);
mgpu::swap(values[i], values[i + 1]);
}
OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
}
};
template<int I> struct OddEvenTransposeSortT<I, I> {
template<typename K, typename V, typename Comp>
static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
Comp comp) {
OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
}
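// Illustrative example of the flag convention (assumed VT = 4): with
// flags = 0b0100, the head flag sits between slots 1 and 2 ((2<< 1) & flags),
// so keys[0..1] and keys[2..3] are sorted independently and never exchange
// elements across the segment boundary.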
////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
int a, int b, Comp comp) {
if(b < Count) {
// Mask the bits between a and b. Any head flags in this interval
// means the keys are in different segments and must not be swapped.
const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
if(!(Mask & flags) && comp(keys[b], keys[a])) {
mgpu::swap(keys[b], keys[a]);
mgpu::swap(values[b], values[a]);
}
}
}
template<int R, int Low2, bool Recurse = 2 * R < Width>
struct OddEvenMerge {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
// Compare and swap
const int M = 2 * R;
OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
#pragma unroll
for(int i = Low2 + R; i + R < Low2 + Width; i += M)
CompareAndSwap(keys, values, flags, i, i + R, comp);
}
};
template<int R, int Low2>
struct OddEvenMerge<R, Low2, false> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
Comp comp) {
CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
}
};
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) {
const int M = Width / 2;
OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
}
};
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
template<typename K, typename V, typename Comp>
MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
Comp comp) { }
};
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
}
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
Comp comp) {
const int Width = 1<< sLogPow2<VT, true>::value;
OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
}
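// Note (illustrative): Width is VT rounded up to a power of two, e.g. VT = 7
// gives Width = 8. The Count template parameter keeps CompareAndSwap from
// touching the out-of-range slot, which is what makes a non-power-of-two VT
// legal despite the power-of-two sorting network.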
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "mgpuenums.h"
#include "device/deviceutil.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh
// For 0 <= i < VT:
// index = NT * i + tid;
// reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
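// Usage sketch (illustrative addition; the kernel fragment and names below
// are hypothetical). A CTA of NT threads pulls one tile of NT * VT elements
// into registers in strided order. DeviceGlobalToReg leaves out-of-range
// slots untouched; the Default overload's extra init argument suggests it
// fills them with init instead:
//
//   const int NV = NT * VT;
//   int gid = NV * blockIdx.x;            // first element of this CTA's tile
//   int count2 = min(NV, count - gid);    // valid elements in the tile
//   T reg[VT];
//   DeviceGlobalToRegDefault<NT, VT>(count2, data_global + gid, tid, reg, T(0));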
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync = false);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
T* reg, bool sync = false);
// For 0 <= i < VT:
// index = VT * tid + i.
// if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init);
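// Usage sketch (illustrative addition; names as in the sketch above). Unlike
// the strided loads (index = NT * i + tid), DeviceGlobalToThread hands each
// thread a contiguous run of VT elements (index = VT * tid + i), the
// thread-order layout used by the register sort networks in sortnetwork.cuh:
//
//   T threadData[VT];
//   DeviceGlobalToThread<NT, VT>(count2, data_global + gid, tid, threadData);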
// For 0 <= i < VT:
// index = NT * i + tid;
// data[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count) data[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync = false);
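// Usage sketch (illustrative addition; names as in the sketches above). The
// store-side counterparts of the loads: a tile is read into registers,
// transformed, then written to shared or back to global memory.
// DeviceRegToShared synchronizes by default so the tile is visible to the
// whole CTA:
//
//   __shared__ T shared[NT * VT];
//   T reg[VT];
//   DeviceGlobalToReg<NT, VT>(count2, data_global + gid, tid, reg);
//   DeviceRegToShared<NT, VT>(reg, tid, shared);              // sync = true
//   // ... CTA-wide work ...
//   DeviceRegToGlobal<NT, VT>(count2, reg, tid, dest_global + gid);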
// For 0 <= index < count:
// dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync = true);
// For 0 <= index < count:
// dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
int tid, T* dest, T init, bool sync = true);
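// Usage sketch (illustrative addition; names as in the sketches above). A
// typical staging pattern: cooperatively load a tile into shared memory,
// operate on it as a CTA, then write the results back to global memory. Both
// calls synchronize by default:
//
//   __shared__ T shared[NT * VT];
//   DeviceGlobalToShared<NT, VT>(count2, source_global + gid, tid, shared);
//   // ... CTA-wide work on shared[0..count2) ...
//   DeviceSharedToGlobal<NT, VT>(count2, shared, tid, dest_global + gid);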
// For 0 <= index < count:
// dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync = false);
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// gather = indices[index];
// reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync = true);
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync = true);
// For 0 <= i < VT:
// index = NT * i + tid;
// if(index < count)
// scatter = indices[index];
// data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync = true);
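// Usage sketch (illustrative addition; names are hypothetical). Each thread
// holds VT gather (or scatter) indices in registers, typically produced by a
// preceding search or partition step; note the different argument order of
// the two functions:
//
//   int indices[VT];                      // per-thread indices
//   T reg[VT];
//   DeviceGather<NT, VT>(count2, data_global, indices, tid, reg);
//   DeviceScatter<NT, VT>(count2, reg, tid, indices, dest_global);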
// For 0 <= i < VT:
// shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync = true);
// For 0 <= i < VT:
// threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
bool sync = true);
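// Usage sketch (illustrative addition; names as in the sketches above). These
// two functions convert between strided order (NT * i + tid) and thread order
// (VT * tid + i) through shared memory; combined with the register sort
// networks they give the usual CTA-wide blocksort skeleton:
//
//   __shared__ T shared[NT * VT];
//   T threadData[VT];
//   DeviceGlobalToShared<NT, VT>(count2, data_global + gid, tid, shared);
//   DeviceSharedToThread<VT>(shared, tid, threadData);   // to thread order
//   // ... sort threadData[0..VT) in registers ...
//   DeviceThreadToShared<VT>(threadData, tid, shared);   // back to shared
//   DeviceSharedToGlobal<NT, VT>(count2, shared, tid, dest_global + gid);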
// For 0 <= index < aCount:
// shared[index] = a_global[index];
// For 0 <= index < bCount:
// shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
// If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
// NT * VT1 must be >= aCount + bCount.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
const T* b_global, int bCount, int tid, T* shared, bool sync = true);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
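// Usage sketch (illustrative addition; a0 and b0 are hypothetical tile
// offsets). When merging two sorted inputs, a CTA loads its slice of A
// followed by its slice of B into one contiguous shared tile. Choose VT0 and
// VT1 as described above: the predication-free path is taken when
// aCount + bCount >= NT * VT0, and NT * VT1 must be >= aCount + bCount:
//
//   __shared__ T shared[NT * VT1];
//   DeviceLoad2ToShared<NT, VT0, VT1>(a_global + a0, aCount,
//       b_global + b0, bCount, tid, shared);
//   // shared[0..aCount) now holds A's slice, shared[aCount..aCount+bCount) B's.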
// For 0 <= i < VT
// index = NT * i + tid;
// if(index < count)
// gather = indices_shared[index];
// dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
const int* indices_shared, int tid, OutputIt dest_global,
bool sync = true);
// For 0 <= i < VT
// index = NT * i + tid
// if(index < count)
// gather = indices[index];
// if(gather < aCount) data = a_global[gather];
// else data = b_global[gather - aCount];
// dest_global[index] = data;
// Synchronize after load.
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename InputIt1, typename InputIt2,
typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
InputIt2 b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
const T* b_global, int bStart, const int* indices, int tid,
T* reg, bool sync = false);
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
const T* b_global, int bStart, const int* indices_shared, int tid,
OutputIt dest_global, bool sync = true);
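// Usage sketch (illustrative addition; names are hypothetical). After a
// key-only merge has left the source index of each output element in shared
// memory, DeviceTransferMergeValuesShared gathers the matching values
// (indices below bStart from a_global, the rest from b_global) and streams
// them directly to global memory:
//
//   DeviceTransferMergeValuesShared<NT, VT>(count2, aVals_global + a0,
//       bVals_global + b0, bStart, indices_shared, tid, vals_dest + gid);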
} // namespace mgpu
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"