Commit 0bf5eb5f authored by lishen

warpctc for dcu

parent 949fcc19
.idea
*~
Makefile
build
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="On explicit save action" serverName="10.6.10.62" preserveTimestamps="false" deleteMissingItems="true" createEmptyFolders="true" filePermissions="420" folderPermissions="493" confirmBeforeUploading="false" confirmBeforeDeletion="false" autoUploadExternalChanges="true">
<option name="confirmBeforeDeletion" value="false" />
<option name="confirmBeforeUploading" value="false" />
<serverData>
<paths name="10.6.10.61">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.62">
<serverdata>
<mappings>
<mapping deploy="/public/home/lishen/warpctc/warpctc_dcu" local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.69">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.6.220">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_89_docker (c96f91f6-1c12-4fb0-b3d8-cc68ccf3f77c)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_90">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_93">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_95">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ON_EXPLICIT_SAVE" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/warpctc_dcu.iml" filepath="$PROJECT_DIR$/.idea/warpctc_dcu.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="CPP_MODULE" version="4">
<component name="FacetManager">
<facet type="Python" name="Python facet">
<configuration sdkName="Python 3.7" />
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.7 interpreter library" level="application" />
</component>
</module>
\ No newline at end of file
# PyTorch bindings for Warp-ctc
[![Build Status](https://travis-ci.org/SeanNaren/warp-ctc.svg?branch=pytorch_bindings)](https://travis-ci.org/SeanNaren/warp-ctc)
This is an extension of the original repo found [here](https://github.com/baidu-research/warp-ctc).
## Installation
Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.
`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:
```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```
Now install the bindings:
```bash
cd pytorch_binding
python setup.py install
```
If you try the above on OSX with anaconda3 (as recommended by PyTorch) and get a dlopen error:
```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$WHOAMI/anaconda3/lib
```
This will resolve the library not loaded error. This can be easily modified to work with other python installs if needed.
An example of using the bindings is below.
```python
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True) # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
```
## Documentation
```
CTCLoss(size_average=False, length_average=False)
# size_average (bool): normalize the loss by the batch size (default: False)
# length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
forward(acts, labels, act_lens, label_lens)
# acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
# labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
# act_lens: Tensor of size (batch) containing size of each output sequence from the network
# label_lens: Tensor of (batch) containing label length of each example
```
\ No newline at end of file
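For reference, a minimal sketch of a batched call with variable-length sequences and `size_average=True`, consistent with the documented signature above (the shapes and values here are illustrative, not from the original README):
```python
import torch
from warpctc_pytorch import CTCLoss

# Average the summed loss over the batch size.
ctc_loss = CTCLoss(size_average=True)

# Two sequences with 5 and 3 valid frames, alphabet of size 6 (index 0 is blank).
acts = torch.randn(5, 2, 6).requires_grad_(True)  # seqLength x batch x outputDim
act_lens = torch.IntTensor([5, 3])                # valid frames per sequence
labels = torch.IntTensor([1, 2, 3, 2, 2])         # targets of both samples, concatenated
label_lens = torch.IntTensor([3, 2])              # 3 targets for sample 0, 2 for sample 1

cost = ctc_loss(acts, labels, act_lens, label_lens)
cost.backward()
```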
# DLIB
## Environment Setup
Before building for DCU, prepare the build environment. See
[environment prepare](environment_prepare.md)
## Installing from Source
### Build Environment Preparation (using dtk-23.04 as an example)
- Clone the source code
```
git clone -b dtk-23.04 http://developer.hpccube.com/codes/aicomponent/warpctc.git
```
- Download DTK-23.04 from the DCU Toolkit section of the [developer community](https://developer.hpccube.com/tool/#sdk), extract it under /opt/, and create a symbolic link
```
cd /opt && ln -s dtk-23.04 dtk
```
- Load the environment variables and install the required dependencies
```shell
source /opt/dtk/env.sh
```
### Build and Install
#### Building the Python API
- Install with python (a quick import check to verify the install is sketched after this section)
```shell
python setup.py install
```
- Build a whl package with python
```shell
python setup.py bdist_wheel
```
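To confirm that the bindings were installed correctly, a quick import check can be run (a minimal sketch; not part of the original instructions):
```python
# Verify that the warpctc bindings import and construct cleanly.
import torch
from warpctc_pytorch import CTCLoss

print(torch.__version__)
print(CTCLoss())  # should print the module without raising ImportError
```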
### Testing
- Verify the correctness of the warpctc loss (consistency between CPU and GPU; a minimal sketch of the idea follows this list)
```shell
python3 test_gpu.py
```
- Verify the GPU speedup of the warpctc loss
```shell
python3 test_gpu_speed.py
```
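The consistency test compares the loss computed on CPU against the loss computed on GPU for identical inputs. A minimal sketch of that idea (illustrative only; the actual test_gpu.py may differ):
```python
# Compute the CTC loss for the same inputs on CPU and GPU and compare.
import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss()

# seqLength x batchSize x alphabet_size activations (pre-softmax).
probs = torch.randn(50, 16, 20).contiguous()
labels = torch.IntTensor(16 * 10).random_(1, 20)   # concatenated targets
label_sizes = torch.IntTensor([10] * 16)           # 10 targets per sample
probs_sizes = torch.IntTensor([50] * 16)           # 50 frames per sample

cost_cpu = ctc_loss(probs.clone(), labels, probs_sizes, label_sizes)
cost_gpu = ctc_loss(probs.clone().cuda(), labels, probs_sizes, label_sizes)

print(cost_cpu.item(), cost_gpu.item())
assert abs(cost_cpu.item() - cost_gpu.item()) < 1e-2
```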
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceLoadBalancingSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.
template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {
int bKey = b_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
else
p = aBegin < bKey;
if(p)
// Advance A (the needle).
a_shared[aBegin++] = bFirst + bBegin;
else
// Advance B (the haystack).
bKey = b_shared[++bBegin];
}
}
////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator<int>(first), b_global) - 1.
// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.
// CTALoadBalance requires NT * VT + 2 slots of shared memory.
template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
int sourceCount, int block, int tid, const int* mp_global,
int* indices_shared, bool loadPrecedingB) {
int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
mp_global);
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(!b0) loadPrecedingB = false;
// Load one trailing term from B. If we're already at the end, fill the
// end of the buffer with destCount.
int aCount = a1 - a0;
int bCount = b1 - b0;
int extended = b1 < sourceCount;
int loadCount = bCount + extended;
int fillCount = NT * VT + 1 - loadCount - aCount;
int* a_shared = indices_shared;
int* b_shared = indices_shared + aCount + (int)loadPrecedingB;
// Load the B values.
// DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
// b_global + b0 - (int)loadPrecedingB, tid,
// b_shared - (int)loadPrecedingB);
for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
b_shared[i] = b_global[b0 + i];
// Fill the end of the array with destCount.
for(int i = tid + extended; i < fillCount; i += NT)
b_shared[bCount + i] = destCount;
__syncthreads();
// Run a merge path to find the start of the serial merge for each thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
aCount, b_shared, bCount, diag, mgpu::less<int>());
int a0tid = a0 + mp;
int b0tid = diag - mp;
// Subtract 1 from b0 because we want to return upper_bound - 1.
DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
b0tid, bCount, a_shared - a0);
__syncthreads();
b0 -= (int)loadPrecedingB;
return make_int4(a0, a1, b0, b1);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialMerge
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
int bBegin, int bEnd, T* results, int* indices, Comp comp) {
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
else
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin - !RangeCheck;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and global
// merge pass levels) locate lists within the single source array.
// Returns (offset of a, offset of b, length of list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
// coop is the number of CTAs or threads cooperating to merge two lists into
// one. We round block down to the first CTA's ID that is working on this
// merge.
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
return make_int3(nv * start, nv * start + size, size);
}
// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
int nv, int count, int mp0, int mp1) {
// Locate diag from the start of the A sublist.
int diag = nv * block - frame.x;
int a0 = frame.x + mp0;
int a1 = min(count, frame.x + mp1);
int b0 = min(count, frame.y + diag - mp0);
int b1 = min(count, frame.y + diag + nv - mp1);
// The end partition of the last block for each merge operation is computed
// and stored as the begin partition for the subsequent merge. i.e. it is
// the same partition but in the wrong coordinate system, so its 0 when it
// should be listSize. Correct that by checking if this is the last block
// in this merge operation.
if(coop - 1 == ((coop - 1) & block)) {
a1 = min(count, frame.x + frame.z);
b1 = min(count, frame.y + frame.z);
}
return make_int4(a0, a1, b0, b1);
}
////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange
MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
int coop, int NV, const int* mp_global) {
// Load the merge paths computed by the partitioning kernel.
int mp0 = mp_global[block];
int mp1 = mp_global[block + 1];
int gid = NV * block;
// Compute the ranges of the sources in global memory.
int4 range;
if(coop) {
int3 frame = FindMergesortFrame(coop, block, NV);
range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0,
mp1);
} else {
range.x = mp0; // a0
range.y = mp1; // a1
range.z = gid - range.x; // b0
range.w = min(aCount + bCount, gid + NV) - range.y; // b1
}
return range;
}
////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
int coop, T* keys, int* indices, Comp comp) {
int list = ~(coop - 1) & tid;
int diag = min(count, VT * ((coop - 1) & tid));
int start = VT * list;
int a0 = min(count, start);
int b0 = min(count, start + VT * (coop / 2));
int b1 = min(count, start + VT * coop);
int p = MergePath<MgpuBoundsLower>(keys_shared + a0, b0 - a0,
keys_shared + b0, b1 - b0, diag, comp);
SerialMerge<VT, true>(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
indices, comp);
}
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int tid, int count,
Comp comp) {
#pragma unroll
for(int coop = 2; coop <= NT; coop *= 2) {
int indices[VT];
KeyType keys[VT];
CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
indices, comp);
if(HasValues) {
// Exchange the values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(keys, tid, keys_shared);
}
}
////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This function sorts the first
// count elements.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int count, int tid,
Comp comp) {
// Stable sort the keys in the thread.
if(VT * tid < count) {
if(Stable)
OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
else
OddEvenMergesort<VT>(threadKeys, threadValues, comp);
}
// Store the locally sorted keys into shared memory.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
// Recursively merge lists until the entire CTA is sorted.
CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
values_shared, tid, count, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
KeyType* keys_shared, int count, int tid, Comp comp) {
int valuesTemp[VT];
CTAMergesort<NT, VT, Stable, false>(threadKeys, valuesTemp, keys_shared,
(int*)keys_shared, count, tid, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int count, int tid, Comp comp) {
CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
values_shared, count, tid, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices
template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
typename T, typename Comp>
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
Comp comp) {
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(LoadExtended) {
bool extended = (a1 < aCount) && (b1 < bCount);
aCount = a1 - a0;
bCount = b1 - b0;
int aCount2 = aCount + (int)extended;
int bCount2 = bCount + (int)extended;
// Load one element past the end of each input to avoid having to use
// range checking in the merge loop.
DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + a0, aCount2,
b_global + b0, bCount2, tid, keys_shared);
// Run a Merge Path search for each thread's starting point.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount2, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int b0tid = aCount2 + diag - mp;
if(extended) {
SerialMerge<VT, false>(keys_shared, a0tid, 0, b0tid, 0, results,
indices, comp);
} else {
int a1tid = aCount;
int b1tid = aCount2 + bCount;
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
results, indices, comp);
}
} else {
// Use the input intervals from the ranges between the merge path
// intersections.
aCount = a1 - a0;
bCount = b1 - b0;
// Load the data into shared memory.
DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
bCount, tid, keys_shared);
// Run a merge path to find the start of the serial merge for each
// thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register.
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid, results,
indices, comp);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.
template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
typename KeyType, typename ValsIt3, typename Comp>
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {
KeyType results[VT];
int indices[VT];
DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
bKeys_global, bCount, range, tid, keys_shared, results, indices, comp);
// Store merge results back to shared memory.
DeviceThreadToShared<VT>(results, tid, keys_shared);
// Store merged keys to global memory.
aCount = range.y - range.x;
bCount = range.w - range.z;
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keys_global + NT * VT * block);
// Copy the values.
if(HasValues) {
DeviceThreadToShared<VT>(indices, tid, indices_shared);
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
aVals_global + range.x, bVals_global + range.z, aCount,
indices_shared, tid, vals_global + NT * VT * block);
}
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// CTAReduce
template<int NT, typename Op = mgpu::plus<int> >
struct CTAReduce {
typedef typename Op::first_argument_type T;
enum { Size = NT, Capacity = NT };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
storage.shared[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
if(tid < destCount) {
// Read from the right half and store to the left half.
x = op(x, storage.shared[destCount + tid]);
storage.shared[tid] = x;
}
__syncthreads();
}
T total = storage.shared[0];
__syncthreads();
return total;
}
};
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
typedef int T;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
// In the first phase, threads cooperatively find the reduction within
// their segment. The segments are SecSize threads (NT / WARP_SIZE)
// wide.
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_add(x, offset, SecSize);
// The last thread in each segment stores the local reduction to shared
// memory.
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
// Reduce the totals of each input segment. The spine is WARP_SIZE
// threads wide.
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_add(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
typedef mgpu::maximum<int> Op;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_max(x, offset, SecSize);
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_max(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTAScan
template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
typedef typename Op::result_type T;
enum { Size = NT, Capacity = 2 * NT + 1 };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {
storage.shared[tid] = x;
int first = 0;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tid >= offset)
x = op(storage.shared[first + tid - offset], x);
first = NT - first;
storage.shared[first + tid] = x;
__syncthreads();
}
*total = storage.shared[first + NT - 1];
if(MgpuScanTypeExc == type)
x = tid ? storage.shared[first + tid - 1] : identity;
__syncthreads();
return x;
}
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
T total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
}
};
////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan<NT, ScanOpAdd> on Kepler.
// This uses the shfl intrinsic to reduce scan latency.
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
enum { Capacity = NumSegments + 1 };
struct Storage { int shared[Capacity + 1]; };
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {
// Define WARP_SIZE segments that are NT / WARP_SIZE large.
// Each warp makes log(SegSize) shfl_add calls.
// The spine makes log(WARP_SIZE) shfl_add calls.
int lane = (SegSize - 1) & tid;
int segment = tid / SegSize;
// Scan each segment using shfl_add.
int scan = x;
#pragma unroll
for(int offset = 1; offset < SegSize; offset *= 2)
scan = shfl_add(scan, offset, SegSize);
// Store the reduction (last element) of each segment into storage.
if(SegSize - 1 == lane) storage.shared[segment] = scan;
__syncthreads();
// Warp 0 does a full shfl warp scan on the partials. The total is
// stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
if(tid < NumSegments) {
int y = storage.shared[tid];
int scan = y;
#pragma unroll
for(int offset = 1; offset < NumSegments; offset *= 2)
scan = shfl_add(scan, offset, NumSegments);
storage.shared[tid] = scan - y;
if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
}
__syncthreads();
// Add the scanned partials back in and convert to exclusive scan.
scan += storage.shared[segment];
if(MgpuScanTypeExc == type) {
scan -= x;
if(identity && !tid) scan = identity;
}
*total = storage.shared[NumSegments];
__syncthreads();
return scan;
}
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
int total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
const int NumWarps = NT / WARP_SIZE;
int warp = tid / WARP_SIZE;
int lane = (WARP_SIZE - 1) & tid;
// Store the bit totals for each warp.
uint bits = __ballot(x);
shared[warp] = popc(bits);
__syncthreads();
#if __CUDA_ARCH__ >= 300
if(tid < NumWarps) {
int x = shared[tid];
int scan = x;
#pragma unroll
for(int offset = 1; offset < NumWarps; offset *= 2)
scan = shfl_add(scan, offset, NumWarps);
shared[tid] = scan - x;
}
__syncthreads();
#else
// Thread 0 scans warp totals.
if(!tid) {
int scan = 0;
#pragma unroll
for(int i = 0; i < NumWarps; ++i) {
int y = shared[i];
shared[i] = scan;
scan += y;
}
shared[NumWarps] = scan;
}
__syncthreads();
#endif // __CUDA_ARCH__ >= 300
// Add the warp scan back into the partials.
int scan = shared[warp] + __popc(bfe(bits, 0, lane));
*total = shared[NumWarps];
__syncthreads();
return scan;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
#include "../mgpudevice.cuh"
namespace mgpu {
template<MgpuBounds Bounds, typename IntT, typename It, typename T,
typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
int shift, Comp comp) {
IntT scale = (1<< shift) - 1;
int mid = (int)((begin + scale * end)>> shift);
T key2 = data[mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
!comp(key, key2) :
comp(key2, key);
if(pred) begin = mid + 1;
else end = mid;
}
template<MgpuBounds Bounds, typename IntT, typename T, typename It,
typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
Comp comp) {
int begin = 0;
int end = count;
if(levels >= 4 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 9, comp);
if(levels >= 3 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 7, comp);
if(levels >= 2 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 5, comp);
if(levels >= 1 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 4, comp);
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
int begin = 0;
int end = count;
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// MergePath search
template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
Comp comp) {
typedef typename std::iterator_traits<It1>::value_type T;
int begin = max(0, diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
T aKey = a[mid];
T bKey = b[diag - 1 - mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search
template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {
// leftEnd and rightStart are defined from the origin, and diag is defined
// from aOffset.
// We only need to run a Merge Path search if the diagonal intersects the
// segment that strides the left and right halves (i.e. is between leftEnd
// and rightStart).
if(aOffset + diag <= leftEnd) return diag;
if(aOffset + diag >= rightStart) return aCount;
bCount = min(bCount, rightStart - bOffset);
int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
int ai = aOffset + mid;
int bi = bOffset + diag - 1 - mid;
bool pred = !comp(keys[bi], keys[ai]);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// BalancedPath search
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
int bCount, int diag, int levels, Comp comp) {
typedef typename std::iterator_traits<InputIt1>::value_type T;
int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
int aIndex = p;
int bIndex = diag - p;
bool star = false;
if(bIndex < bCount) {
if(Duplicates) {
T x = b[bIndex];
// Search for the beginning of the duplicate run in both A and B.
// Because
int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
levels, comp);
int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
levels, comp);
// The distance between the merge path and the lower_bound is the
// 'run'. We add up the a- and b- runs and evenly distribute them to
// get a stairstep path.
int aRun = aIndex - aStart;
int bRun = bIndex - bStart;
int xCount = aRun + bRun;
// Attempt to advance b and regress a.
int bAdvance = max(xCount>> 1, bRun);
int bEnd = min(bCount, bStart + bAdvance + 1);
int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
bEnd - bIndex, x, comp) + bIndex;
bRun = bRunEnd - bStart;
bAdvance = min(bAdvance, bRun);
int aAdvance = xCount - bAdvance;
bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
aIndex = aStart + aAdvance;
if(roundUp) star = true;
} else {
if(aIndex && aCount) {
T aKey = a[aIndex - 1];
T bKey = b[bIndex];
// If the last consumed element in A (aIndex - 1) is the same as
// the next element in B (bIndex), we're sitting at a starred
// partition.
if(!comp(aKey, bKey)) star = true;
}
}
}
return make_int2(aIndex, star);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasegscan.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.
// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.
struct SegReduceRange {
int begin;
int end;
int total;
bool flushLast;
};
MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
SegReduceRange range;
range.begin = 0x7fffffff & limit0;
range.end = 0x7fffffff & limit1;
range.total = range.end - range.begin;
range.flushLast = 0 == (0x80000000 & limit1);
range.end += !range.flushLast;
return range;
}
// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval Spmv.
template<int VT>
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
int rows[VT + 1]) {
rows[0] = first;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & endFlags) ++first;
rows[i + 1] = first;
}
}
////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.
template<int NT, int VT>
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {
// Each thread binary searches for its starting row.
int row = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
mgpu::less<int>()) - 1;
// Each thread starts at row and scans forward, emitting row IDs into
// register. Store the CTA-local row index (starts at 0) to rows and the
// start of the row (globally) to rowStarts.
int curOffset = csr_shared[row];
int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
rows[0] = row;
rowStarts[0] = curOffset;
int endFlags = 0;
#pragma unroll
for(int i = 1; i <= VT; ++i) {
// Advance the row cursor when the iterator hits the next row offset.
if(tidOffset + i == nextOffset) {
// Set an end flag when the cursor advances to the next row.
endFlags |= 1<< (i - 1);
// Advance the cursor and load the next row offset.
++row;
curOffset = nextOffset;
nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
}
rows[i] = row;
if(i < VT) rowStarts[i] = curOffset;
}
__syncthreads();
return endFlags;
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute end-flags
// by comparing adjacent row IDs.
// DeviceSegReducePrepare may be called either by a pre-processing kernel or by
// the kernel that actually evaluates the segmented reduction if no preprocessing
// is desired.
struct SegReduceTerms {
int endFlags;
int tidDelta;
};
template<int NT, int VT>
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {
// Pass a sentinel (end) to point to the next segment start. If we flush,
// this is the end of this tile. Otherwise it is INT_MAX
int endFlags = DeviceExpandCsrRows<NT, VT>(gid + VT * tid, csr_shared,
numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);
// Find the distance to scan to compute carry-in for each thread. Use the
// existence of an end flag anywhere in the thread to determine if carry-out
// values from the left should propagate through to the right.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, rows[0] != rows[VT],
csr_shared);
SegReduceTerms terms = { endFlags, tidDelta };
return terms;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
// segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.
template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
struct CTASegReduce {
typedef CTASegScan<NT, Op> SegScan;
enum {
NV = NT * VT,
Capacity = HalfCapacity ? (NV / 2) : NV
};
union Storage {
typename SegScan::Storage segScanStorage;
T values[Capacity];
};
template<typename DestIt>
MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
int tidDelta, int startRow, int block, int tid, T data[VT],
DestIt dest_global, T* carryOut_global, T identity, Op op,
Storage& storage) {
// Run a segmented scan within the thread.
T x, localScan[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
x = i ? op(x, data[i]) : data[i];
localScan[i] = x;
if(rows[i] != rows[i + 1]) x = identity;
}
// Run a parallel segmented scan over the carry-out values to compute
// carry-in.
T carryOut;
T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
storage.segScanStorage, &carryOut, identity, op);
// Store the carry-out for the entire CTA to global memory.
if(!tid) carryOut_global[block] = carryOut;
dest_global += startRow;
if(HalfCapacity && total > Capacity) {
// Add carry-in to each thread-local scan value. Store directly
// to global.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store on the end flag and clear the carry-in.
if(rows[i] != rows[i + 1]) {
carryIn = identity;
dest_global[rows[i]] = x2;
}
}
} else {
// All partials fit in shared memory. Add carry-in to each thread-
// local scan value.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store reduction when the segment changes and clear the
// carry-in.
if(rows[i] != rows[i + 1]) {
storage.values[rows[i]] = x2;
carryIn = identity;
}
}
__syncthreads();
// Cooperatively store reductions to global memory.
for(int index = tid; index < total; index += NT)
dest_global[index] = storage.values[index];
__syncthreads();
}
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.
template<int NT>
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
const int NumWarps = NT / 32;
int warp = tid / 32;
int lane = 31 & tid;
uint warpMask = 0xffffffff>> (31 - lane); // inclusive search
uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search
uint warpBits = __ballot(flag);
delta_shared[warp] = warpBits;
__syncthreads();
if(tid < NumWarps) {
uint ctaBits = __ballot(0 != delta_shared[tid]);
int warpSegment = 31 - clz(ctaMask & ctaBits);
int start = (-1 != warpSegment) ?
(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
delta_shared[NumWarps + tid] = start;
}
__syncthreads();
// Find the closest flag to the left of this thread within the warp.
// Include the flag for this thread.
int start = 31 - clz(warpMask & warpBits);
if(-1 != start) start += ~31 & tid;
else start = delta_shared[NumWarps + warp];
__syncthreads();
return tid - start;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegScan
template<int NT, typename _Op = mgpu::plus<int> >
struct CTASegScan {
typedef _Op Op;
typedef typename Op::result_type T;
enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
union Storage {
int delta[NumWarps];
T values[Capacity];
};
// Each thread passes the reduction of the LAST SEGMENT that it covers.
// flag is set to true if there's at least one segment flag in the thread.
// SegScan returns the reduction of values for the first segment in this
// thread over the preceding threads.
// Return the value init for the first thread.
// When scanning single elements per thread, interpret the flag as a BEGIN
// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
// thread tid.
// The function returns the reduction of the last segment in the CTA.
MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {
// Run an inclusive scan
int first = 0;
storage.values[first + tid] = x;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tidDelta >= offset)
x = op(storage.values[first + tid - offset], x);
first = NT - first;
storage.values[first + tid] = x;
__syncthreads();
}
// Get the exclusive scan.
x = tid ? storage.values[first + tid - 1] : identity;
*carryOut = storage.values[first + NT - 1];
__syncthreads();
return x;
}
MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
T* carryOut, T identity = (T)0, Op op = Op()) {
// Find the left-most thread that covers the first segment of this
// thread.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);
return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
template<int VT, typename T, typename Comp>
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
int leftEnd, int rightStart, Comp comp, bool sync = true) {
bEnd = min(rightStart, bEnd);
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
// If A has run out of inputs, emit B.
if(aBegin >= aEnd)
p = false;
else if(bBegin >= bEnd || aBegin < leftEnd)
// B has hit the end of the middle segment.
// Emit A if A has inputs remaining in the middle segment.
p = true;
else
// Emit the smaller element in the middle segment.
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
if(sync) { __syncthreads(); }
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {
// Locate the intervals of the input lists.
int3 frame = FindMergesortFrame(2<< pass, tid, VT);
int a0 = frame.x;
int b0 = frame.y;
int listLen = frame.z;
int list = tid>> pass;
int listParity = 1 & list;
int diag = VT * tid - frame.x;
// Fetch the active range for the list this thread's list is merging with.
int siblingRange = ranges_shared[1 ^ list];
int siblingStart = 0x0000ffff & siblingRange;
int siblingEnd = siblingRange>> 16;
// Create a new active range for the merge.
int leftEnd = listParity ? siblingEnd : activeRange.y;
int rightStart = listParity ? activeRange.x : siblingStart;
activeRange.x = min(activeRange.x, siblingStart);
activeRange.y = max(activeRange.y, siblingEnd);
int p = SegmentedMergePath(keys_shared, a0, listLen, b0, listLen, leftEnd,
rightStart, diag, comp);
int a0tid = a0 + p;
int b0tid = b0 + diag - p;
SegmentedSerialMerge<VT>(keys_shared, a0tid, b0, b0tid, b0 + listLen,
results, indices, leftEnd, rightStart, comp);
// Store the ranges to shared memory.
if(0 == diag)
ranges_shared[list>> 1] =
(int)bfi(activeRange.y, activeRange.x, 16, 16);
}
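// Editor's note: illustrative helpers, not part of the moderngpu sources, that
// spell out the 16:16 packing convention used for ranges_shared above. The
// call bfi(activeRange.y, activeRange.x, 16, 16) places the start in the low
// half-word and the end in the high half-word; unpacking mirrors the lines
// "siblingStart = 0x0000ffff & siblingRange; siblingEnd = siblingRange >> 16".
// PackRange16 and UnpackRange16 are hypothetical names.
MGPU_HOST_DEVICE int PackRange16(int start, int end) {
	return (end << 16) | (0x0000ffff & start);
}
MGPU_HOST_DEVICE void UnpackRange16(int packed, int& start, int& end) {
	start = 0x0000ffff & packed;
	end = packed >> 16;
}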
////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, int tid, int2 activeRange, Comp comp) {
const int NumPasses = sLogPow2<NT>::value;
#pragma unroll
for(int pass = 0; pass < NumPasses; ++pass) {
int indices[VT];
CTASegsortPass<NT, VT>(keys_shared, ranges_shared, tid, pass,
threadKeys, indices, activeRange, comp);
if(HasValues) {
// Exchange values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
}
return activeRange;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, Comp comp) {
if(Stable)
// Odd-even transpose sort.
OddEvenTransposeSortFlags<VT>(threadKeys, threadValues, headFlags,
comp);
else
// Batcher's odd-even mergesort.
OddEvenMergesortFlags<VT>(threadKeys, threadValues, headFlags, comp);
// Record the first and last occurrence of head flags in this segment.
int blockEnd = 31 - clz(headFlags);
if(-1 != blockEnd) blockEnd += VT * tid;
int blockStart = ffs(headFlags);
blockStart = blockStart ? (VT * tid - 1 + blockStart) : (NT * VT);
ranges_shared[tid] = (int)bfi(blockEnd, blockStart, 16, 16);
// Store back to shared mem. The values are in VT-length sorted lists.
// These are merged recursively.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
int2 activeRange = CTASegsortLoop<NT, VT, HasValues>(threadKeys,
threadValues, keys_shared, values_shared, ranges_shared, tid,
make_int2(blockStart, blockEnd), comp);
return activeRange;
}
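// Editor's note: a host-only sketch, not part of the moderngpu sources, of the
// clz/ffs bookkeeping in CTASegsort above: it finds the left-most and
// right-most head flags among this thread's VT keys and converts them to
// CTA-wide key positions (NT * VT meaning "no head flag", matching blockStart
// above; -1 meaning "none" for blockEnd). ThreadHeadFlagRange is hypothetical.
MGPU_HOST_DEVICE void ThreadHeadFlagRange(int headFlags, int tid, int NT,
	int VT, int& blockStart, int& blockEnd) {
	blockEnd = -1;
	for(int i = VT - 1; i >= 0; --i)           // highest set bit (31 - clz).
		if(headFlags & (1 << i)) { blockEnd = VT * tid + i; break; }
	blockStart = NT * VT;
	for(int i = 0; i < VT; ++i)                // lowest set bit (ffs).
		if(headFlags & (1 << i)) { blockStart = VT * tid + i; break; }
}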
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid, int headFlags,
KeyType* keys_shared, int* ranges_shared, Comp comp) {
int valuesTemp[VT];
return CTASegsort<NT, VT, Stable, false>(threadKeys, valuesTemp, tid,
headFlags, keys_shared, (int*)keys_shared, ranges_shared, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, Comp comp) {
return CTASegsort<NT, VT, Stable, true>(threadKeys, threadValues, tid,
headFlags, keys_shared, values_shared, ranges_shared, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.
template<int NT, int VT, bool Stable, bool HasValues, typename InputIt1,
typename InputIt2, typename KeyType, typename ValType, typename OutputIt1,
typename OutputIt2, typename Comp>
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
InputIt2 values_global, int count2, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, int headFlags, int tid,
int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
int* ranges_global, Comp comp) {
// Load keys into register in thread order.
int gid = NT * VT * block;
KeyType threadKeys[VT];
DeviceGlobalToShared<NT, VT>(count2, keys_global + gid, tid, keys_shared);
DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);
// Load the values from global memory and into register in thread order.
ValType threadValues[VT];
if(HasValues) {
DeviceGlobalToShared<NT, VT>(count2, values_global + gid, tid,
values_shared);
DeviceSharedToThread<VT>(values_shared, tid, threadValues);
}
// Run the CTA segmented blocksort.
int2 activeRange = CTASegsort<NT, VT, Stable, HasValues>(threadKeys,
threadValues, tid, headFlags, keys_shared, values_shared, ranges_shared,
comp);
// Store the keys to global memory.
DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
keysDest_global + gid);
if(HasValues) {
// Store the values to global memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceSharedToGlobal<NT, VT>(count2, values_shared, tid,
valsDest_global + gid, false);
}
// Store the 16-bit packed ranges. These are used by all merge kernels and
// the first level of global segmented merge path partitioning.
if(!tid)
ranges_global[block] = bfi(activeRange.y, activeRange.x, 16, 16);
}
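// Editor's note: a host-only reference, not part of the moderngpu sources, of
// what one blocksort tile produces: a stable sort of the tile's keys in which
// no key crosses a position carrying a head flag, so every flagged position
// starts an independently sorted segment. Insertion sort is used for clarity
// only; SegBlocksortReference is a hypothetical name.
template<typename T, typename Comp>
void SegBlocksortReference(T* keys, const bool* headFlag, int count,
	Comp comp) {
	for(int i = 1; i < count; ++i) {
		T key = keys[i];
		int j = i;
		// A key may shift left only while the slot it would cross does not
		// begin a new segment.
		while(j > 0 && !headFlag[j] && comp(key, keys[j - 1])) {
			keys[j] = keys[j - 1];
			--j;
		}
		keys[j] = key;
	}
}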
////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.
template<int NT, int VT>
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
const int* partitions_global, int tid, int block, int count2,
int* words_shared, byte* flags_shared) {
const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
int gid = NT * VT * block;
int p0 = partitions_global[block];
int p1 = partitions_global[block + 1];
int headFlags = 0;
if(p1 > p0 || count2 < NT * VT) {
// Clear the flag bytes, then loop through the indices and poke in flag
// values.
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i)
words_shared[NT * i + tid] = 0;
__syncthreads();
for(int index = p0 + tid; index < p1; index += NT) {
int headFlag = indices_global[index];
flags_shared[headFlag - gid] = 1;
}
__syncthreads();
// Combine all the head flags for this thread.
int first = VT * tid;
int offset = first / 4;
int prev = words_shared[offset];
int mask = 0x3210 + 0x1111 * (3 & first);
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i) {
// Gather the next four flags.
int next = words_shared[offset + 1 + i];
int x = prmt(prev, next, mask);
prev = next;
// Set the head flag bits.
if(0x00000001 & x) headFlags |= 1<< (4 * i);
if(0x00000100 & x) headFlags |= 1<< (4 * i + 1);
if(0x00010000 & x) headFlags |= 1<< (4 * i + 2);
if(0x01000000 & x) headFlags |= 1<< (4 * i + 3);
}
__syncthreads();
// Set head flags for out-of-range keys.
int outOfRange = min(VT, first + VT - count2);
if(outOfRange > 0)
headFlags = bfi(0xffffffff, headFlags, VT - outOfRange, outOfRange);
// Clear head flags above VT.
headFlags &= (1<< VT) - 1;
}
return headFlags;
}
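// Editor's note: a host-only reference, not part of the moderngpu sources, for
// DeviceIndicesToHeadFlags above: given the global segment-start indices that
// land inside this tile, build the VT-bit head-flag word for one thread, and
// flag out-of-range (padding) positions so each forms its own segment, as the
// bfi(0xffffffff, ...) step does above. Hypothetical name and signature.
inline int IndicesToHeadFlagsReference(const int* indices, int numIndices,
	int gid, int tid, int NT, int VT, int count2) {
	int headFlags = 0;
	int first = VT * tid;                      // first key owned by this thread.
	for(int i = 0; i < numIndices; ++i) {
		int local = indices[i] - gid;          // position within the tile.
		if(local >= first && local < first + VT)
			headFlags |= 1 << (local - first);
	}
	for(int i = 0; i < VT; ++i)                // head flags for padding keys.
		if(first + i >= count2)
			headFlags |= 1 << i;
	return headFlags;
}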
////////////////////////////////////////////////////////////////////////////////
// SegSortSupport
struct SegSortSupport {
int* ranges_global;
int2* ranges2_global;
int4* mergeList_global;
int* copyList_global;
int2* queueCounters_global;
int2* nextCounters_global;
byte* copyStatus_global;
};
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType,
typename Comp>
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
const ValueType* values_global, int2 segmentRange, int tid,
int block, int4 range, int pass, KeyType* keys_shared,
int* indices_shared, KeyType* keysDest_global, ValueType* valsDest_global,
Comp comp) {
const int NV = NT * VT;
int gid = NV * block;
// Load the local compressed segment indices.
int a0 = range.x;
int aCount = range.y - range.x;
int b0 = range.z;
int bCount = range.w - range.z;
DeviceLoad2ToShared<NT, VT, VT>(keys_global + a0, aCount, keys_global + b0,
bCount, tid, keys_shared);
////////////////////////////////////////////////////////////////////////////
// Run a merge path to find the starting point for each thread to merge.
// If the entire warp fits into the already-sorted segments, we can skip
// sorting it and leave its keys in shared memory. Doing this on the warp
// level rather than thread level (also legal) gives slightly better
// performance.
int segStart = segmentRange.x;
int segEnd = segmentRange.y;
int listParity = 1 & (block>> pass);
int warpOffset = VT * (~31 & tid);
bool sortWarp = listParity ?
// The spliced segment is to the left (segStart).
(warpOffset < segStart) :
// The spliced segment is to the right (segEnd).
(warpOffset + 32 * VT > segEnd);
KeyType threadKeys[VT];
int indices[VT];
if(sortWarp) {
int diag = VT * tid;
int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register. The list parity is uniform across the CTA,
// so we hoist the parity check outside the function call to simplify the
// logic. Unlike in the blocksort, this does not cause warp divergence.
SegmentedSerialMerge<VT>(keys_shared, a0tid, a1tid, b0tid, b1tid,
threadKeys, indices, listParity ? 0 : segEnd,
listParity ? segStart : NV, comp, false);
}
__syncthreads();
// Store sorted data in register back to shared memory. Then copy to global.
if(sortWarp)
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared, false);
__syncthreads();
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keysDest_global + gid);
////////////////////////////////////////////////////////////////////////////
// Use the merge indices to gather values from global memory. Store directly
// to valsDest_global.
if(HasValues) {
// Transpose the gather indices to help coalesce loads.
if(sortWarp)
DeviceThreadToShared<VT>(indices, tid, indices_shared, false);
else {
#pragma unroll
for(int i = 0; i < VT; ++i)
indices_shared[VT * tid + i] = VT * tid + i;
}
__syncthreads();
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
values_global + a0, values_global + b0, aCount, indices_shared,
tid, valsDest_global + NV * block);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType>
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
const ValueType* values_global, int tid, int block, int count,
KeyType* keysDest_global, ValueType* valsDest_global) {
int gid = NT * VT * block;
int count2 = min(NT * VT, count - gid);
DeviceGlobalToGlobal<NT, VT>(count2, keys_global + gid, tid,
keysDest_global + gid);
if(HasValues)
DeviceGlobalToGlobal<NT, VT>(count2, values_global + gid, tid,
valsDest_global + gid);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch
template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
Comp comp) {
const int FlagA = IndexA ? 0x80000000 : 1;
const int FlagB = IndexB ? 0x80000000 : 1;
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
T aPrev, bPrev;
if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
int decisions = 0;
int matchCountA = 0;
int matchCountB = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck && aBegin >= aEnd) p = false;
else if(RangeCheck && bBegin >= bEnd) p = true;
else p = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(p) {
// aKey is smaller than bKey, so it is inserted before bKey.
// Save bKey's index (bBegin + first) as the result of the search
// and advance to the next needle in A.
bool match = false;
if(MatchA) {
// Test if there is an element in B that matches aKey.
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: We're inserting aKey after bKey. If there
// is a match for aKey it must be bPrev. Check that bPrev
// is in range and equal to aKey.
// The predicate test result !comp(aKey, bPrev) was
// established on the previous A-advancing iteration (it
// failed the comp(aKey, bKey) test to get us to this
// point). Check the other half of the equality condition
// with a second comparison.
bool inRange = !RangeCheck || (bBegin > aEnd);
match = inRange && !comp(bPrev, aKey);
} else {
// Lower Bound: We're inserting aKey before bKey. If there
// is a match for aKey, it must be bKey. Check that bKey
// is in range and equal to aKey.
// The predicate test !comp(bKey, aKey) has established one
// half of the equality condition. We establish the other
// half with a second comparison.
bool inRange = !RangeCheck || (bBegin < bEnd);
match = inRange && !comp(aKey, bKey);
}
}
int index = 0;
if(IndexA) index = bOffset + bBegin;
if(match) index |= FlagA;
if(IndexA || MatchA) indices[i] = index;
matchCountA += match;
// Mark the decision bit to indicate that this iteration has
// progressed A (the needles).
decisions |= 1<< i;
aPrev = aKey;
aKey = keys_shared[++aBegin];
} else {
// aKey is larger than bKey, so it is inserted after bKey (but we
// don't know where yet). Advance the B index to the next element in
// the haystack to continue the search for the current needle.
bool match = false;
if(MatchB) {
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: aKey is not smaller than bKey. We advance to
// the next haystack element in B. If there is a match in A
// for bKey it must be aKey. By entering this branch we've
// verified that !comp(aKey, bKey). Making the reciprocal
// comparison !comp(bKey, aKey) establishes aKey == bKey.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin < aEnd));
match = inRange && !comp(bKey, aKey);
} else {
// Lower Bound: bKey is smaller than aKey. We advance to the
// next element in B. If there is a match for bKey, it must
// be aPrev. The previous A-advancing iteration proved that
// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
// other half of the equality condition.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin > 0));
match = inRange && !comp(aPrev, bKey);
}
}
int index = 0;
if(IndexB) index = aOffset + aBegin;
if(match) index |= FlagB;
if(IndexB || MatchB) indices[i] = index;
matchCountB += match;
// Keep the decision bit cleared to indicate that this iteration
// has progressed B (the haystack).
bPrev = bKey;
bKey = keys_shared[++bBegin];
}
}
return make_int3(decisions, matchCountA, matchCountB);
}
////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.
template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
int tid, int* indices_shared, Comp comp) {
// Run a merge path to find the start of the serial search for each thread.
int diag = VT * tid;
int mp = MergePath<Bounds>(keys_shared + aStart, aCount,
keys_shared + bStart, bCount, diag, comp);
int a0tid = mp;
int b0tid = diag - mp;
// Serial search into register.
int3 results;
int indices[VT];
if(extended)
results = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
else
results = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
__syncthreads();
// Compact the indices into shared memory. Use the decision bits (set is A,
// cleared is B) to select the destination.
int decisions = results.x;
b0tid += aCount;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & decisions) {
if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
} else {
if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
}
}
__syncthreads();
// Return the match counts for A and B keys.
return make_int2(results.y, results.z);
}
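// Editor's note: a host-only reference, not part of the moderngpu sources, of
// the lower-bound case of the sorted search above: for each needle a[i] it
// reports the first haystack position j with !comp(b[j], a[i]) and whether an
// exact match exists there (the two comparisons together establish equality,
// as in the MatchA logic of DeviceSerialSearch). Both inputs must be sorted by
// comp. SortedSearchLowerBoundReference is a hypothetical name.
template<typename T, typename Comp>
void SortedSearchLowerBoundReference(const T* a, int aCount, const T* b,
	int bCount, int* aIndices, bool* aMatch, Comp comp) {
	int j = 0;
	for(int i = 0; i < aCount; ++i) {
		while(j < bCount && comp(b[j], a[i])) ++j;       // advance the haystack.
		aIndices[i] = j;                                 // insertion point of a[i].
		aMatch[i] = (j < bCount) && !comp(a[i], b[j]);   // second half of equality.
	}
}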
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include <climits>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
template<typename T>
struct less : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
template<typename T>
struct plus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
////////////////////////////////////////////////////////////////////////////////
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
T c = a;
a = b;
b = c;
}
template<typename T>
struct DevicePair {
T x, y;
};
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
DevicePair<T> p = { x, y };
return p;
}
template<typename T> struct numeric_limits;
template<> struct numeric_limits<int> {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
a = a + b;
return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
a = a * b;
return a;
}
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::max(a, b);
#else
return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::min(a, b);
#else
return (b < a) ? b : a;
#endif
}
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
return _value;
}
MGPU_HOST_DEVICE T operator*() const {
return _value;
}
MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
return *this;
}
MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
return *this;
}
private:
T _value;
};
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _value + i;
}
MGPU_HOST_DEVICE T operator*() {
return _value;
}
MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
return counting_iterator(_value + diff);
}
MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
return counting_iterator(_value - diff);
}
MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
_value += diff;
return *this;
}
MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
_value -= diff;
return *this;
}
private:
T _value;
};
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + (-diff);
}
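// Editor's note: an illustrative usage sketch, not part of the moderngpu
// sources, for the lightweight iterators defined above. They can stand in for
// pointers in the library's host and device interfaces: constant_iterator
// repeats one value, counting_iterator yields value + i at position i, and
// step_iterator yields base + i * step. IteratorUsageSketch is hypothetical.
inline void IteratorUsageSketch() {
	mgpu::constant_iterator<int> ones(1);
	mgpu::counting_iterator<int> ids(100);
	mgpu::step_iterator<float> grid(0.0f, 0.5f);
	int a = ones[7];       // 1
	int b = ids[7];        // 107
	float c = grid[4];     // 2.0f
	(void)a; (void)b; (void)c;
}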