Commit 0bf5eb5f authored by lishen

warpctc for dcu

parent 949fcc19
.idea
*~
Makefile
build
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="On explicit save action" serverName="10.6.10.62" preserveTimestamps="false" deleteMissingItems="true" createEmptyFolders="true" filePermissions="420" folderPermissions="493" confirmBeforeUploading="false" confirmBeforeDeletion="false" autoUploadExternalChanges="true">
<option name="confirmBeforeDeletion" value="false" />
<option name="confirmBeforeUploading" value="false" />
<serverData>
<paths name="10.6.10.61">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.62">
<serverdata>
<mappings>
<mapping deploy="/public/home/lishen/warpctc/warpctc_dcu" local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.69">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.6.220">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_89_docker (c96f91f6-1c12-4fb0-b3d8-cc68ccf3f77c)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_90">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_93">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_95">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ON_EXPLICIT_SAVE" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/warpctc_dcu.iml" filepath="$PROJECT_DIR$/.idea/warpctc_dcu.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="CPP_MODULE" version="4">
<component name="FacetManager">
<facet type="Python" name="Python facet">
<configuration sdkName="Python 3.7" />
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.7 interpreter library" level="application" />
</component>
</module>
\ No newline at end of file
# PyTorch bindings for Warp-ctc
[![Build Status](https://travis-ci.org/SeanNaren/warp-ctc.svg?branch=pytorch_bindings)](https://travis-ci.org/SeanNaren/warp-ctc)
This is an extension of the original repo found [here](https://github.com/baidu-research/warp-ctc).
## Installation
Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.
`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:
```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```
Now install the bindings:
```bash
cd pytorch_binding
python setup.py install
```
If you try the above on OSX with anaconda3 (as recommended by PyTorch) and get a dlopen error:
```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$WHOAMI/anaconda3/lib
```
This will resolve the library not loaded error. This can be easily modified to work with other python installs if needed.
An example of using the bindings is below.
```python
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True) # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
```
## Documentation
```
CTCLoss(size_average=False, length_average=False)
# size_average (bool): normalize the loss by the batch size (default: False)
# length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
forward(acts, labels, act_lens, label_lens)
# acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
# labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
# act_lens: Tensor of size (batch) containing size of each output sequence from the network
# label_lens: Tensor of (batch) containing label length of each example
```
\ No newline at end of file
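For reference, a minimal sketch of a batched call with variable-length sequences and `size_average=True`, consistent with the documented signature above (the shapes and values here are illustrative, not from the original README):
```python
import torch
from warpctc_pytorch import CTCLoss

# Average the summed loss over the batch size.
ctc_loss = CTCLoss(size_average=True)

# Two sequences with 5 and 3 valid frames, alphabet of size 6 (index 0 is blank).
acts = torch.randn(5, 2, 6).requires_grad_(True)  # seqLength x batch x outputDim
act_lens = torch.IntTensor([5, 3])                # valid frames per sequence
labels = torch.IntTensor([1, 2, 3, 2, 2])         # targets of both samples, concatenated
label_lens = torch.IntTensor([3, 2])              # 3 targets for sample 0, 2 for sample 1

cost = ctc_loss(acts, labels, act_lens, label_lens)
cost.backward()
```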
# DLIB
## Environment Setup
Before building for DCU, prepare the build environment. See
[environment prepare](environment_prepare.md)
## Installing from Source
### Build Environment Preparation (using dtk-23.04 as an example)
- Clone the source code
```
git clone -b dtk-23.04 http://developer.hpccube.com/codes/aicomponent/warpctc.git
```
- Download DTK-23.04 from the DCU Toolkit section of the [developer community](https://developer.hpccube.com/tool/#sdk), extract it under /opt/, and create a symbolic link
```
cd /opt && ln -s dtk-23.04 dtk
```
- Load the environment variables and install the required dependencies
```shell
source /opt/dtk/env.sh
```
### Build and Install
#### Building the Python API
- Install with python (a quick import check to verify the install is sketched after this section)
```shell
python setup.py install
```
- Build a whl package with python
```shell
python setup.py bdist_wheel
```
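To confirm that the bindings were installed correctly, a quick import check can be run (a minimal sketch; not part of the original instructions):
```python
# Verify that the warpctc bindings import and construct cleanly.
import torch
from warpctc_pytorch import CTCLoss

print(torch.__version__)
print(CTCLoss())  # should print the module without raising ImportError
```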
### Testing
- Verify the correctness of the warpctc loss (consistency between CPU and GPU; a minimal sketch of the idea follows this list)
```shell
python3 test_gpu.py
```
- Verify the GPU speedup of the warpctc loss
```shell
python3 test_gpu_speed.py
```
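The consistency test compares the loss computed on CPU against the loss computed on GPU for identical inputs. A minimal sketch of that idea (illustrative only; the actual test_gpu.py may differ):
```python
# Compute the CTC loss for the same inputs on CPU and GPU and compare.
import torch
from warpctc_pytorch import CTCLoss

ctc_loss = CTCLoss()

# seqLength x batchSize x alphabet_size activations (pre-softmax).
probs = torch.randn(50, 16, 20).contiguous()
labels = torch.IntTensor(16 * 10).random_(1, 20)   # concatenated targets
label_sizes = torch.IntTensor([10] * 16)           # 10 targets per sample
probs_sizes = torch.IntTensor([50] * 16)           # 50 frames per sample

cost_cpu = ctc_loss(probs.clone(), labels, probs_sizes, label_sizes)
cost_gpu = ctc_loss(probs.clone().cuda(), labels, probs_sizes, label_sizes)

print(cost_cpu.item(), cost_gpu.item())
assert abs(cost_cpu.item() - cost_gpu.item()) < 1e-2
```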
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceLoadBalancingSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.
template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {
int bKey = b_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
else
p = aBegin < bKey;
if(p)
// Advance A (the needle).
a_shared[aBegin++] = bFirst + bBegin;
else
// Advance B (the haystack).
bKey = b_shared[++bBegin];
}
}
////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator<int>(first), b_global) - 1.
// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.
// CTALoadBalance requires NT * VT + 2 slots of shared memory.
template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
int sourceCount, int block, int tid, const int* mp_global,
int* indices_shared, bool loadPrecedingB) {
int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
mp_global);
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(!b0) loadPrecedingB = false;
// Load one trailing term from B. If we're already at the end, fill the
// end of the buffer with destCount.
int aCount = a1 - a0;
int bCount = b1 - b0;
int extended = b1 < sourceCount;
int loadCount = bCount + extended;
int fillCount = NT * VT + 1 - loadCount - aCount;
int* a_shared = indices_shared;
int* b_shared = indices_shared + aCount + (int)loadPrecedingB;
// Load the B values.
// DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
// b_global + b0 - (int)loadPrecedingB, tid,
// b_shared - (int)loadPrecedingB);
for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
b_shared[i] = b_global[b0 + i];
// Fill the end of the array with destCount.
for(int i = tid + extended; i < fillCount; i += NT)
b_shared[bCount + i] = destCount;
__syncthreads();
// Run a merge path to find the start of the serial merge for each thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
aCount, b_shared, bCount, diag, mgpu::less<int>());
int a0tid = a0 + mp;
int b0tid = diag - mp;
// Subtract 1 from b0 because we want to return upper_bound - 1.
DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
b0tid, bCount, a_shared - a0);
__syncthreads();
b0 -= (int)loadPrecedingB;
return make_int4(a0, a1, b0, b1);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialMerge
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
int bBegin, int bEnd, T* results, int* indices, Comp comp) {
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck)
p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
else
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin - !RangeCheck;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and global
// merge pass levels) locate lists within the single source array.
// Returns (offset of a, offset of b, length of list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
// coop is the number of CTAs or threads cooperating to merge two lists into
// one. We round block down to the first CTA's ID that is working on this
// merge.
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
return make_int3(nv * start, nv * start + size, size);
}
// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
int nv, int count, int mp0, int mp1) {
// Locate diag from the start of the A sublist.
int diag = nv * block - frame.x;
int a0 = frame.x + mp0;
int a1 = min(count, frame.x + mp1);
int b0 = min(count, frame.y + diag - mp0);
int b1 = min(count, frame.y + diag + nv - mp1);
// The end partition of the last block for each merge operation is computed
// and stored as the begin partition for the subsequent merge. i.e. it is
// the same partition but in the wrong coordinate system, so its 0 when it
// should be listSize. Correct that by checking if this is the last block
// in this merge operation.
if(coop - 1 == ((coop - 1) & block)) {
a1 = min(count, frame.x + frame.z);
b1 = min(count, frame.y + frame.z);
}
return make_int4(a0, a1, b0, b1);
}
////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange
MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
int coop, int NV, const int* mp_global) {
// Load the merge paths computed by the partitioning kernel.
int mp0 = mp_global[block];
int mp1 = mp_global[block + 1];
int gid = NV * block;
// Compute the ranges of the sources in global memory.
int4 range;
if(coop) {
int3 frame = FindMergesortFrame(coop, block, NV);
range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0,
mp1);
} else {
range.x = mp0; // a0
range.y = mp1; // a1
range.z = gid - range.x; // b0
range.w = min(aCount + bCount, gid + NV) - range.y; // b1
}
return range;
}
////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
int coop, T* keys, int* indices, Comp comp) {
int list = ~(coop - 1) & tid;
int diag = min(count, VT * ((coop - 1) & tid));
int start = VT * list;
int a0 = min(count, start);
int b0 = min(count, start + VT * (coop / 2));
int b1 = min(count, start + VT * coop);
int p = MergePath<MgpuBoundsLower>(keys_shared + a0, b0 - a0,
keys_shared + b0, b1 - b0, diag, comp);
SerialMerge<VT, true>(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
indices, comp);
}
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int tid, int count,
Comp comp) {
#pragma unroll
for(int coop = 2; coop <= NT; coop *= 2) {
int indices[VT];
KeyType keys[VT];
CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
indices, comp);
if(HasValues) {
// Exchange the values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(keys, tid, keys_shared);
}
}
////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This function sorts the first
// count elements.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int count, int tid,
Comp comp) {
// Stable sort the keys in the thread.
if(VT * tid < count) {
if(Stable)
OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
else
OddEvenMergesort<VT>(threadKeys, threadValues, comp);
}
// Store the locally sorted keys into shared memory.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
// Recursively merge lists until the entire CTA is sorted.
CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
values_shared, tid, count, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
KeyType* keys_shared, int count, int tid, Comp comp) {
int valuesTemp[VT];
CTAMergesort<NT, VT, Stable, false>(threadKeys, valuesTemp, keys_shared,
(int*)keys_shared, count, tid, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int count, int tid, Comp comp) {
CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
values_shared, count, tid, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices
template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
typename T, typename Comp>
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
Comp comp) {
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(LoadExtended) {
bool extended = (a1 < aCount) && (b1 < bCount);
aCount = a1 - a0;
bCount = b1 - b0;
int aCount2 = aCount + (int)extended;
int bCount2 = bCount + (int)extended;
// Load one element past the end of each input to avoid having to use
// range checking in the merge loop.
DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + a0, aCount2,
b_global + b0, bCount2, tid, keys_shared);
// Run a Merge Path search for each thread's starting point.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount2, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int b0tid = aCount2 + diag - mp;
if(extended) {
SerialMerge<VT, false>(keys_shared, a0tid, 0, b0tid, 0, results,
indices, comp);
} else {
int a1tid = aCount;
int b1tid = aCount2 + bCount;
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
results, indices, comp);
}
} else {
// Use the input intervals from the ranges between the merge path
// intersections.
aCount = a1 - a0;
bCount = b1 - b0;
// Load the data into shared memory.
DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
bCount, tid, keys_shared);
// Run a merge path to find the start of the serial merge for each
// thread.
int diag = VT * tid;
int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
keys_shared + aCount, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register.
SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid, results,
indices, comp);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.
template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
typename KeyType, typename ValsIt3, typename Comp>
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {
KeyType results[VT];
int indices[VT];
DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
bKeys_global, bCount, range, tid, keys_shared, results, indices, comp);
// Store merge results back to shared memory.
DeviceThreadToShared<VT>(results, tid, keys_shared);
// Store merged keys to global memory.
aCount = range.y - range.x;
bCount = range.w - range.z;
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keys_global + NT * VT * block);
// Copy the values.
if(HasValues) {
DeviceThreadToShared<VT>(indices, tid, indices_shared);
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
aVals_global + range.x, bVals_global + range.z, aCount,
indices_shared, tid, vals_global + NT * VT * block);
}
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// CTAReduce
template<int NT, typename Op = mgpu::plus<int> >
struct CTAReduce {
typedef typename Op::first_argument_type T;
enum { Size = NT, Capacity = NT };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
storage.shared[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
if(tid < destCount) {
// Read from the right half and store to the left half.
x = op(x, storage.shared[destCount + tid]);
storage.shared[tid] = x;
}
__syncthreads();
}
T total = storage.shared[0];
__syncthreads();
return total;
}
};
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
typedef int T;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
// In the first phase, threads cooperatively find the reduction within
// their segment. The segments are SecSize threads (NT / WARP_SIZE)
// wide.
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_add(x, offset, SecSize);
// The last thread in each segment stores the local reduction to shared
// memory.
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
// Reduce the totals of each input segment. The spine is WARP_SIZE
// threads wide.
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_add(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
typedef mgpu::maximum<int> Op;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_max(x, offset, SecSize);
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_max(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTAScan
template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
typedef typename Op::result_type T;
enum { Size = NT, Capacity = 2 * NT + 1 };
struct Storage { T shared[Capacity]; };
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {
storage.shared[tid] = x;
int first = 0;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tid >= offset)
x = op(storage.shared[first + tid - offset], x);
first = NT - first;
storage.shared[first + tid] = x;
__syncthreads();
}
*total = storage.shared[first + NT - 1];
if(MgpuScanTypeExc == type)
x = tid ? storage.shared[first + tid - 1] : identity;
__syncthreads();
return x;
}
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
T total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
}
};
////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan<NT, ScanOpAdd> on Kepler.
// This uses the shfl intrinsic to reduce scan latency.
#if __CUDA_ARCH__ >= 300
template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
typedef mgpu::plus<int> Op;
enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
enum { Capacity = NumSegments + 1 };
struct Storage { int shared[Capacity + 1]; };
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {
// Define WARP_SIZE segments that are NT / WARP_SIZE large.
// Each warp makes log(SegSize) shfl_add calls.
// The spine makes log(WARP_SIZE) shfl_add calls.
int lane = (SegSize - 1) & tid;
int segment = tid / SegSize;
// Scan each segment using shfl_add.
int scan = x;
#pragma unroll
for(int offset = 1; offset < SegSize; offset *= 2)
scan = shfl_add(scan, offset, SegSize);
// Store the reduction (last element) of each segment into storage.
if(SegSize - 1 == lane) storage.shared[segment] = scan;
__syncthreads();
// Warp 0 does a full shfl warp scan on the partials. The total is
// stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
if(tid < NumSegments) {
int y = storage.shared[tid];
int scan = y;
#pragma unroll
for(int offset = 1; offset < NumSegments; offset *= 2)
scan = shfl_add(scan, offset, NumSegments);
storage.shared[tid] = scan - y;
if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
}
__syncthreads();
// Add the scanned partials back in and convert to exclusive scan.
scan += storage.shared[segment];
if(MgpuScanTypeExc == type) {
scan -= x;
if(identity && !tid) scan = identity;
}
*total = storage.shared[NumSegments];
__syncthreads();
return scan;
}
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
int total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
const int NumWarps = NT / WARP_SIZE;
int warp = tid / WARP_SIZE;
int lane = (WARP_SIZE - 1) & tid;
// Store the bit totals for each warp.
uint bits = __ballot(x);
shared[warp] = popc(bits);
__syncthreads();
#if __CUDA_ARCH__ >= 300
if(tid < NumWarps) {
int x = shared[tid];
int scan = x;
#pragma unroll
for(int offset = 1; offset < NumWarps; offset *= 2)
scan = shfl_add(scan, offset, NumWarps);
shared[tid] = scan - x;
}
__syncthreads();
#else
// Thread 0 scans warp totals.
if(!tid) {
int scan = 0;
#pragma unroll
for(int i = 0; i < NumWarps; ++i) {
int y = shared[i];
shared[i] = scan;
scan += y;
}
shared[NumWarps] = scan;
}
__syncthreads();
#endif // __CUDA_ARCH__ >= 300
// Add the warp scan back into the partials.
int scan = shared[warp] + __popc(bfe(bits, 0, lane));
*total = shared[NumWarps];
__syncthreads();
return scan;
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
#include "../mgpudevice.cuh"
namespace mgpu {
template<MgpuBounds Bounds, typename IntT, typename It, typename T,
typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
int shift, Comp comp) {
IntT scale = (1<< shift) - 1;
int mid = (int)((begin + scale * end)>> shift);
T key2 = data[mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
!comp(key, key2) :
comp(key2, key);
if(pred) begin = mid + 1;
else end = mid;
}
template<MgpuBounds Bounds, typename IntT, typename T, typename It,
typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
Comp comp) {
int begin = 0;
int end = count;
if(levels >= 4 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 9, comp);
if(levels >= 3 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 7, comp);
if(levels >= 2 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 5, comp);
if(levels >= 1 && begin < end)
BinarySearchIt<Bounds, IntT>(data, begin, end, key, 4, comp);
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
int begin = 0;
int end = count;
while(begin < end)
BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// MergePath search
template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
Comp comp) {
typedef typename std::iterator_traits<It1>::value_type T;
int begin = max(0, diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
T aKey = a[mid];
T bKey = b[diag - 1 - mid];
bool pred = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search
template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {
// leftEnd and rightStart are defined from the origin, and diag is defined
// from aOffset.
// We only need to run a Merge Path search if the diagonal intersects the
// segment that strides the left and right halves (i.e. is between leftEnd
// and rightStart).
if(aOffset + diag <= leftEnd) return diag;
if(aOffset + diag >= rightStart) return aCount;
bCount = min(bCount, rightStart - bOffset);
int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
int ai = aOffset + mid;
int bi = bOffset + diag - 1 - mid;
bool pred = !comp(keys[bi], keys[ai]);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// BalancedPath search
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
int bCount, int diag, int levels, Comp comp) {
typedef typename std::iterator_traits<InputIt1>::value_type T;
int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
int aIndex = p;
int bIndex = diag - p;
bool star = false;
if(bIndex < bCount) {
if(Duplicates) {
T x = b[bIndex];
// Search for the beginning of the duplicate run in both A and B.
// Because
int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
levels, comp);
int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
levels, comp);
// The distance between the merge path and the lower_bound is the
// 'run'. We add up the a- and b- runs and evenly distribute them to
// get a stairstep path.
int aRun = aIndex - aStart;
int bRun = bIndex - bStart;
int xCount = aRun + bRun;
// Attempt to advance b and regress a.
int bAdvance = max(xCount>> 1, bRun);
int bEnd = min(bCount, bStart + bAdvance + 1);
int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
bEnd - bIndex, x, comp) + bIndex;
bRun = bRunEnd - bStart;
bAdvance = min(bAdvance, bRun);
int aAdvance = xCount - bAdvance;
bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
aIndex = aStart + aAdvance;
if(roundUp) star = true;
} else {
if(aIndex && aCount) {
T aKey = a[aIndex - 1];
T bKey = b[bIndex];
// If the last consumed element in A (aIndex - 1) is the same as
// the next element in B (bIndex), we're sitting at a starred
// partition.
if(!comp(aKey, bKey)) star = true;
}
}
}
return make_int2(aIndex, star);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasegscan.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.
// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.
struct SegReduceRange {
int begin;
int end;
int total;
bool flushLast;
};
MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
SegReduceRange range;
range.begin = 0x7fffffff & limit0;
range.end = 0x7fffffff & limit1;
range.total = range.end - range.begin;
range.flushLast = 0 == (0x80000000 & limit1);
range.end += !range.flushLast;
return range;
}
// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval Spmv.
template<int VT>
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
int rows[VT + 1]) {
rows[0] = first;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & endFlags) ++first;
rows[i + 1] = first;
}
}
////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.
template<int NT, int VT>
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {
// Each thread binary searches for its starting row.
int row = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
mgpu::less<int>()) - 1;
// Each thread starts at row and scans forward, emitting row IDs into
// register. Store the CTA-local row index (starts at 0) to rows and the
// start of the row (globally) to rowStarts.
int curOffset = csr_shared[row];
int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
rows[0] = row;
rowStarts[0] = curOffset;
int endFlags = 0;
#pragma unroll
for(int i = 1; i <= VT; ++i) {
// Advance the row cursor when the iterator hits the next row offset.
if(tidOffset + i == nextOffset) {
// Set an end flag when the cursor advances to the next row.
endFlags |= 1<< (i - 1);
// Advance the cursor and load the next row offset.
++row;
curOffset = nextOffset;
nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
}
rows[i] = row;
if(i < VT) rowStarts[i] = curOffset;
}
__syncthreads();
return endFlags;
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute end-flags
// by comparing adjacent row IDs.
// DeviceSegReducePrepare may be called either by a pre-processing kernel or by
// the kernel that actually evaluates the segmented reduction if no preprocessing
// is desired.
struct SegReduceTerms {
int endFlags;
int tidDelta;
};
template<int NT, int VT>
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {
// Pass a sentinel (end) to point to the next segment start. If we flush,
// this is the end of this tile. Otherwise it is INT_MAX
int endFlags = DeviceExpandCsrRows<NT, VT>(gid + VT * tid, csr_shared,
numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);
// Find the distance to scan to compute carry-in for each thread. Use the
// existence of an end flag anywhere in the thread to determine if carry-out
// values from the left should propagate through to the right.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, rows[0] != rows[VT],
csr_shared);
SegReduceTerms terms = { endFlags, tidDelta };
return terms;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
// segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.
template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
struct CTASegReduce {
typedef CTASegScan<NT, Op> SegScan;
enum {
NV = NT * VT,
Capacity = HalfCapacity ? (NV / 2) : NV
};
union Storage {
typename SegScan::Storage segScanStorage;
T values[Capacity];
};
template<typename DestIt>
MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
int tidDelta, int startRow, int block, int tid, T data[VT],
DestIt dest_global, T* carryOut_global, T identity, Op op,
Storage& storage) {
// Run a segmented scan within the thread.
T x, localScan[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
x = i ? op(x, data[i]) : data[i];
localScan[i] = x;
if(rows[i] != rows[i + 1]) x = identity;
}
// Run a parallel segmented scan over the carry-out values to compute
// carry-in.
T carryOut;
T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
storage.segScanStorage, &carryOut, identity, op);
// Store the carry-out for the entire CTA to global memory.
if(!tid) carryOut_global[block] = carryOut;
dest_global += startRow;
if(HalfCapacity && total > Capacity) {
// Add carry-in to each thread-local scan value. Store directly
// to global.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store on the end flag and clear the carry-in.
if(rows[i] != rows[i + 1]) {
carryIn = identity;
dest_global[rows[i]] = x2;
}
}
} else {
// All partials fit in shared memory. Add carry-in to each thread-
// local scan value.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store reduction when the segment changes and clear the
// carry-in.
if(rows[i] != rows[i + 1]) {
storage.values[rows[i]] = x2;
carryIn = identity;
}
}
__syncthreads();
// Cooperatively store reductions to global memory.
for(int index = tid; index < total; index += NT)
dest_global[index] = storage.values[index];
__syncthreads();
}
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.
template<int NT>
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
const int NumWarps = NT / 32;
int warp = tid / 32;
int lane = 31 & tid;
uint warpMask = 0xffffffff>> (31 - lane); // inclusive search
uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search
uint warpBits = __ballot(flag);
delta_shared[warp] = warpBits;
__syncthreads();
if(tid < NumWarps) {
uint ctaBits = __ballot(0 != delta_shared[tid]);
int warpSegment = 31 - clz(ctaMask & ctaBits);
int start = (-1 != warpSegment) ?
(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
delta_shared[NumWarps + tid] = start;
}
__syncthreads();
// Find the closest flag to the left of this thread within the warp.
// Include the flag for this thread.
int start = 31 - clz(warpMask & warpBits);
if(-1 != start) start += ~31 & tid;
else start = delta_shared[NumWarps + warp];
__syncthreads();
return tid - start;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegScan
template<int NT, typename _Op = mgpu::plus<int> >
struct CTASegScan {
typedef _Op Op;
typedef typename Op::result_type T;
enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
union Storage {
int delta[NumWarps];
T values[Capacity];
};
// Each thread passes the reduction of the LAST SEGMENT that it covers.
// flag is set to true if there's at least one segment flag in the thread.
// SegScan returns the reduction of values for the first segment in this
// thread over the preceding threads.
// Return the value init for the first thread.
// When scanning single elements per thread, interpret the flag as a BEGIN
// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
// thread tid.
// The function returns the reduction of the last segment in the CTA.
MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {
// Run an inclusive scan
int first = 0;
storage.values[first + tid] = x;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
if(tidDelta >= offset)
x = op(storage.values[first + tid - offset], x);
first = NT - first;
storage.values[first + tid] = x;
__syncthreads();
}
// Get the exclusive scan.
x = tid ? storage.values[first + tid - 1] : identity;
*carryOut = storage.values[first + NT - 1];
__syncthreads();
return x;
}
MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
T* carryOut, T identity = (T)0, Op op = Op()) {
// Find the left-most thread that covers the first segment of this
// thread.
int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);
return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
}
};
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
template<int VT, typename T, typename Comp>
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
int leftEnd, int rightStart, Comp comp, bool sync = true) {
bEnd = min(rightStart, bEnd);
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
// If A has run out of inputs, emit B.
if(aBegin >= aEnd)
p = false;
else if(bBegin >= bEnd || aBegin < leftEnd)
// B has hit the end of the middle segment.
// Emit A if A has inputs remaining in the middle segment.
p = true;
else
// Emit the smaller element in the middle segment.
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
indices[i] = p ? aBegin : bBegin;
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
if(sync) { __syncthreads(); }
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {
// Locate the intervals of the input lists.
int3 frame = FindMergesortFrame(2<< pass, tid, VT);
int a0 = frame.x;
int b0 = frame.y;
int listLen = frame.z;
int list = tid>> pass;
int listParity = 1 & list;
int diag = VT * tid - frame.x;
// Fetch the active range for the list this thread's list is merging with.
int siblingRange = ranges_shared[1 ^ list];
int siblingStart = 0x0000ffff & siblingRange;
int siblingEnd = siblingRange>> 16;
// Create a new active range for the merge.
int leftEnd = listParity ? siblingEnd : activeRange.y;
int rightStart = listParity ? activeRange.x : siblingStart;
activeRange.x = min(activeRange.x, siblingStart);
activeRange.y = max(activeRange.y, siblingEnd);
int p = SegmentedMergePath(keys_shared, a0, listLen, b0, listLen, leftEnd,
rightStart, diag, comp);
int a0tid = a0 + p;
int b0tid = b0 + diag - p;
SegmentedSerialMerge<VT>(keys_shared, a0tid, b0, b0tid, b0 + listLen,
results, indices, leftEnd, rightStart, comp);
// Store the ranges to shared memory.
if(0 == diag)
ranges_shared[list>> 1] =
(int)bfi(activeRange.y, activeRange.x, 16, 16);
}
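// Editor's note: illustrative helpers, not part of the moderngpu sources, that
// spell out the 16:16 packing convention used for ranges_shared above. The
// call bfi(activeRange.y, activeRange.x, 16, 16) places the start in the low
// half-word and the end in the high half-word; unpacking mirrors the lines
// "siblingStart = 0x0000ffff & siblingRange; siblingEnd = siblingRange >> 16".
// PackRange16 and UnpackRange16 are hypothetical names.
MGPU_HOST_DEVICE int PackRange16(int start, int end) {
	return (end << 16) | (0x0000ffff & start);
}
MGPU_HOST_DEVICE void UnpackRange16(int packed, int& start, int& end) {
	start = 0x0000ffff & packed;
	end = packed >> 16;
}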
////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, int tid, int2 activeRange, Comp comp) {
const int NumPasses = sLogPow2<NT>::value;
#pragma unroll
for(int pass = 0; pass < NumPasses; ++pass) {
int indices[VT];
CTASegsortPass<NT, VT>(keys_shared, ranges_shared, tid, pass,
threadKeys, indices, activeRange, comp);
if(HasValues) {
// Exchange values through shared memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
}
return activeRange;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
typename ValType, typename Comp>
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, Comp comp) {
if(Stable)
// Odd-even transpose sort.
OddEvenTransposeSortFlags<VT>(threadKeys, threadValues, headFlags,
comp);
else
// Batcher's odd-even mergesort.
OddEvenMergesortFlags<VT>(threadKeys, threadValues, headFlags, comp);
// Record the first and last occurrence of head flags in this segment.
int blockEnd = 31 - clz(headFlags);
if(-1 != blockEnd) blockEnd += VT * tid;
int blockStart = ffs(headFlags);
blockStart = blockStart ? (VT * tid - 1 + blockStart) : (NT * VT);
ranges_shared[tid] = (int)bfi(blockEnd, blockStart, 16, 16);
// Store back to shared mem. The values are in VT-length sorted lists.
// These are merged recursively.
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
int2 activeRange = CTASegsortLoop<NT, VT, HasValues>(threadKeys,
threadValues, keys_shared, values_shared, ranges_shared, tid,
make_int2(blockStart, blockEnd), comp);
return activeRange;
}
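// Editor's note: a host-only sketch, not part of the moderngpu sources, of the
// clz/ffs bookkeeping in CTASegsort above: it finds the left-most and
// right-most head flags among this thread's VT keys and converts them to
// CTA-wide key positions (NT * VT meaning "no head flag", matching blockStart
// above; -1 meaning "none" for blockEnd). ThreadHeadFlagRange is hypothetical.
MGPU_HOST_DEVICE void ThreadHeadFlagRange(int headFlags, int tid, int NT,
	int VT, int& blockStart, int& blockEnd) {
	blockEnd = -1;
	for(int i = VT - 1; i >= 0; --i)           // highest set bit (31 - clz).
		if(headFlags & (1 << i)) { blockEnd = VT * tid + i; break; }
	blockStart = NT * VT;
	for(int i = 0; i < VT; ++i)                // lowest set bit (ffs).
		if(headFlags & (1 << i)) { blockStart = VT * tid + i; break; }
}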
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid, int headFlags,
KeyType* keys_shared, int* ranges_shared, Comp comp) {
int valuesTemp[VT];
return CTASegsort<NT, VT, Stable, false>(threadKeys, valuesTemp, tid,
headFlags, keys_shared, (int*)keys_shared, ranges_shared, comp);
}
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
typename Comp>
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, Comp comp) {
return CTASegsort<NT, VT, Stable, true>(threadKeys, threadValues, tid,
headFlags, keys_shared, values_shared, ranges_shared, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.
template<int NT, int VT, bool Stable, bool HasValues, typename InputIt1,
typename InputIt2, typename KeyType, typename ValType, typename OutputIt1,
typename OutputIt2, typename Comp>
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
InputIt2 values_global, int count2, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, int headFlags, int tid,
int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
int* ranges_global, Comp comp) {
// Load keys into register in thread order.
int gid = NT * VT * block;
KeyType threadKeys[VT];
DeviceGlobalToShared<NT, VT>(count2, keys_global + gid, tid, keys_shared);
DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);
// Load the values from global memory and into register in thread order.
ValType threadValues[VT];
if(HasValues) {
DeviceGlobalToShared<NT, VT>(count2, values_global + gid, tid,
values_shared);
DeviceSharedToThread<VT>(values_shared, tid, threadValues);
}
// Run the CTA segmented blocksort.
int2 activeRange = CTASegsort<NT, VT, Stable, HasValues>(threadKeys,
threadValues, tid, headFlags, keys_shared, values_shared, ranges_shared,
comp);
// Store the keys to global memory.
DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
keysDest_global + gid);
if(HasValues) {
// Store the values to global memory.
DeviceThreadToShared<VT>(threadValues, tid, values_shared);
DeviceSharedToGlobal<NT, VT>(count2, values_shared, tid,
valsDest_global + gid, false);
}
// Store the 16-bit packed ranges. These are used by all merge kernels and
// the first level of global segmented merge path partitioning.
if(!tid)
ranges_global[block] = bfi(activeRange.y, activeRange.x, 16, 16);
}
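// Editor's note: a host-only reference, not part of the moderngpu sources, of
// what one blocksort tile produces: a stable sort of the tile's keys in which
// no key crosses a position carrying a head flag, so every flagged position
// starts an independently sorted segment. Insertion sort is used for clarity
// only; SegBlocksortReference is a hypothetical name.
template<typename T, typename Comp>
void SegBlocksortReference(T* keys, const bool* headFlag, int count,
	Comp comp) {
	for(int i = 1; i < count; ++i) {
		T key = keys[i];
		int j = i;
		// A key may shift left only while the slot it would cross does not
		// begin a new segment.
		while(j > 0 && !headFlag[j] && comp(key, keys[j - 1])) {
			keys[j] = keys[j - 1];
			--j;
		}
		keys[j] = key;
	}
}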
////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.
template<int NT, int VT>
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
const int* partitions_global, int tid, int block, int count2,
int* words_shared, byte* flags_shared) {
const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
int gid = NT * VT * block;
int p0 = partitions_global[block];
int p1 = partitions_global[block + 1];
int headFlags = 0;
if(p1 > p0 || count2 < NT * VT) {
// Clear the flag bytes, then loop through the indices and poke in flag
// values.
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i)
words_shared[NT * i + tid] = 0;
__syncthreads();
for(int index = p0 + tid; index < p1; index += NT) {
int headFlag = indices_global[index];
flags_shared[headFlag - gid] = 1;
}
__syncthreads();
// Combine all the head flags for this thread.
int first = VT * tid;
int offset = first / 4;
int prev = words_shared[offset];
int mask = 0x3210 + 0x1111 * (3 & first);
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i) {
// Gather the next four flags.
int next = words_shared[offset + 1 + i];
int x = prmt(prev, next, mask);
prev = next;
// Set the head flag bits.
if(0x00000001 & x) headFlags |= 1<< (4 * i);
if(0x00000100 & x) headFlags |= 1<< (4 * i + 1);
if(0x00010000 & x) headFlags |= 1<< (4 * i + 2);
if(0x01000000 & x) headFlags |= 1<< (4 * i + 3);
}
__syncthreads();
// Set head flags for out-of-range keys.
int outOfRange = min(VT, first + VT - count2);
if(outOfRange > 0)
headFlags = bfi(0xffffffff, headFlags, VT - outOfRange, outOfRange);
// Clear head flags above VT.
headFlags &= (1<< VT) - 1;
}
return headFlags;
}
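// Editor's note: a host-only reference, not part of the moderngpu sources, for
// DeviceIndicesToHeadFlags above: given the global segment-start indices that
// land inside this tile, build the VT-bit head-flag word for one thread, and
// flag out-of-range (padding) positions so each forms its own segment, as the
// bfi(0xffffffff, ...) step does above. Hypothetical name and signature.
inline int IndicesToHeadFlagsReference(const int* indices, int numIndices,
	int gid, int tid, int NT, int VT, int count2) {
	int headFlags = 0;
	int first = VT * tid;                      // first key owned by this thread.
	for(int i = 0; i < numIndices; ++i) {
		int local = indices[i] - gid;          // position within the tile.
		if(local >= first && local < first + VT)
			headFlags |= 1 << (local - first);
	}
	for(int i = 0; i < VT; ++i)                // head flags for padding keys.
		if(first + i >= count2)
			headFlags |= 1 << i;
	return headFlags;
}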
////////////////////////////////////////////////////////////////////////////////
// SegSortSupport
struct SegSortSupport {
int* ranges_global;
int2* ranges2_global;
int4* mergeList_global;
int* copyList_global;
int2* queueCounters_global;
int2* nextCounters_global;
byte* copyStatus_global;
};
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType,
typename Comp>
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
const ValueType* values_global, int2 segmentRange, int tid,
int block, int4 range, int pass, KeyType* keys_shared,
int* indices_shared, KeyType* keysDest_global, ValueType* valsDest_global,
Comp comp) {
const int NV = NT * VT;
int gid = NV * block;
// Load the local compressed segment indices.
int a0 = range.x;
int aCount = range.y - range.x;
int b0 = range.z;
int bCount = range.w - range.z;
DeviceLoad2ToShared<NT, VT, VT>(keys_global + a0, aCount, keys_global + b0,
bCount, tid, keys_shared);
////////////////////////////////////////////////////////////////////////////
// Run a merge path to find the starting point for each thread to merge.
// If the entire warp fits into the already-sorted segments, we can skip
// sorting it and leave its keys in shared memory. Doing this on the warp
// level rather than thread level (also legal) gives slightly better
// performance.
int segStart = segmentRange.x;
int segEnd = segmentRange.y;
int listParity = 1 & (block>> pass);
int warpOffset = VT * (~31 & tid);
bool sortWarp = listParity ?
// The spliced segment is to the left (segStart).
(warpOffset < segStart) :
// The spliced segment is to the right (segEnd).
(warpOffset + 32 * VT > segEnd);
KeyType threadKeys[VT];
int indices[VT];
if(sortWarp) {
int diag = VT * tid;
int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register. The list parity is uniform across the CTA,
// so we hoist the parity check outside the function call to simplify the
// logic. Unlike in the blocksort, this does not cause warp divergence.
SegmentedSerialMerge<VT>(keys_shared, a0tid, a1tid, b0tid, b1tid,
threadKeys, indices, listParity ? 0 : segEnd,
listParity ? segStart : NV, comp, false);
}
__syncthreads();
// Store sorted data in register back to shared memory. Then copy to global.
if(sortWarp)
DeviceThreadToShared<VT>(threadKeys, tid, keys_shared, false);
__syncthreads();
DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
keysDest_global + gid);
////////////////////////////////////////////////////////////////////////////
// Use the merge indices to gather values from global memory. Store directly
// to valsDest_global.
if(HasValues) {
// Transpose the gather indices to help coalesce loads.
if(sortWarp)
DeviceThreadToShared<VT>(indices, tid, indices_shared, false);
else {
#pragma unroll
for(int i = 0; i < VT; ++i)
indices_shared[VT * tid + i] = VT * tid + i;
}
__syncthreads();
DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
values_global + a0, values_global + b0, aCount, indices_shared,
tid, valsDest_global + NV * block);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType>
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
const ValueType* values_global, int tid, int block, int count,
KeyType* keysDest_global, ValueType* valsDest_global) {
int gid = NT * VT * block;
int count2 = min(NT * VT, count - gid);
DeviceGlobalToGlobal<NT, VT>(count2, keys_global + gid, tid,
keysDest_global + gid);
if(HasValues)
DeviceGlobalToGlobal<NT, VT>(count2, values_global + gid, tid,
valsDest_global + gid);
}
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch
template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
Comp comp) {
const int FlagA = IndexA ? 0x80000000 : 1;
const int FlagB = IndexB ? 0x80000000 : 1;
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
T aPrev, bPrev;
if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
int decisions = 0;
int matchCountA = 0;
int matchCountB = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(RangeCheck && aBegin >= aEnd) p = false;
else if(RangeCheck && bBegin >= bEnd) p = true;
else p = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(p) {
// aKey is smaller than bKey, so it is inserted before bKey.
// Save bKey's index (bBegin + first) as the result of the search
// and advance to the next needle in A.
bool match = false;
if(MatchA) {
// Test if there is an element in B that matches aKey.
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: We're inserting aKey after bKey. If there
// is a match for aKey it must be bPrev. Check that bPrev
// is in range and equal to aKey.
// The predicate test result !comp(aKey, bPrev) was
// established on the previous A-advancing iteration (it
// failed the comp(aKey, bKey) test to get us to this
// point). Check the other half of the equality condition
// with a second comparison.
bool inRange = !RangeCheck || (bBegin > aEnd);
match = inRange && !comp(bPrev, aKey);
} else {
// Lower Bound: We're inserting aKey before bKey. If there
// is a match for aKey, it must be bKey. Check that bKey
// is in range and equal to aKey.
// The predicate test !comp(bKey, aKey) has established one
// half of the equality condition. We establish the other
// half with a second comparison.
bool inRange = !RangeCheck || (bBegin < bEnd);
match = inRange && !comp(aKey, bKey);
}
}
int index = 0;
if(IndexA) index = bOffset + bBegin;
if(match) index |= FlagA;
if(IndexA || MatchA) indices[i] = index;
matchCountA += match;
// Mark the decision bit to indicate that this iteration has
// progressed A (the needles).
decisions |= 1<< i;
aPrev = aKey;
aKey = keys_shared[++aBegin];
} else {
// aKey is larger than bKey, so it is inserted after bKey (but we
// don't know where yet). Advance the B index to the next element in
// the haystack to continue the search for the current needle.
bool match = false;
if(MatchB) {
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: aKey is not smaller than bKey. We advance to
// the next haystack element in B. If there is a match in A
// for bKey it must be aKey. By entering this branch we've
// verified that !comp(aKey, bKey). Making the reciprocal
// comparison !comp(bKey, aKey) establishes aKey == bKey.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin < aEnd));
match = inRange && !comp(bKey, aKey);
} else {
// Lower Bound: bKey is smaller than aKey. We advance to the
// next element in B. If there is a match for bKey, it must
// be aPrev. The previous A-advancing iteration proved that
// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
// other half of the equality condition.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin > 0));
match = inRange && !comp(aPrev, bKey);
}
}
int index = 0;
if(IndexB) index = aOffset + aBegin;
if(match) index |= FlagB;
if(IndexB || MatchB) indices[i] = index;
matchCountB += match;
// Keep the decision bit cleared to indicate that this iteration
// has progressed B (the haystack).
bPrev = bKey;
bKey = keys_shared[++bBegin];
}
}
return make_int3(decisions, matchCountA, matchCountB);
}
////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.
template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
int tid, int* indices_shared, Comp comp) {
// Run a merge path to find the start of the serial search for each thread.
int diag = VT * tid;
int mp = MergePath<Bounds>(keys_shared + aStart, aCount,
keys_shared + bStart, bCount, diag, comp);
int a0tid = mp;
int b0tid = diag - mp;
// Serial search into register.
int3 results;
int indices[VT];
if(extended)
results = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
else
results = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA, IndexB,
MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
__syncthreads();
// Compact the indices into shared memory. Use the decision bits (set is A,
// cleared is B) to select the destination.
int decisions = results.x;
b0tid += aCount;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & decisions) {
if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
} else {
if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
}
}
__syncthreads();
// Return the match counts for A and B keys.
return make_int2(results.y, results.z);
}
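// Editor's note: a host-only reference, not part of the moderngpu sources, of
// the lower-bound case of the sorted search above: for each needle a[i] it
// reports the first haystack position j with !comp(b[j], a[i]) and whether an
// exact match exists there (the two comparisons together establish equality,
// as in the MatchA logic of DeviceSerialSearch). Both inputs must be sorted by
// comp. SortedSearchLowerBoundReference is a hypothetical name.
template<typename T, typename Comp>
void SortedSearchLowerBoundReference(const T* a, int aCount, const T* b,
	int bCount, int* aIndices, bool* aMatch, Comp comp) {
	int j = 0;
	for(int i = 0; i < aCount; ++i) {
		while(j < bCount && comp(b[j], a[i])) ++j;       // advance the haystack.
		aIndices[i] = j;                                 // insertion point of a[i].
		aMatch[i] = (j < bCount) && !comp(a[i], b[j]);   // second half of equality.
	}
}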
} // namespace mgpu
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include <climits>
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
template<typename T>
struct less : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
template<typename T>
struct plus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};
template<typename T>
struct minus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};
template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};
template<typename T>
struct modulus : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
};
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
};
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
};
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
};
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
};
////////////////////////////////////////////////////////////////////////////////
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
T c = a;
a = b;
b = c;
}
template<typename T>
struct DevicePair {
T x, y;
};
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
DevicePair<T> p = { x, y };
return p;
}
template<typename T> struct numeric_limits;
template<> struct numeric_limits<int> {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
template<> struct numeric_limits<long long> {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
template<> struct numeric_limits<uint> {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
template<> struct numeric_limits<unsigned long long> {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
template<> struct numeric_limits<float> {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
template<> struct numeric_limits<double> {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
return make_int2(a.x + b.x, a.y + b.y);
}
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
a = a + b;
return a;
}
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
return make_int2(a.x * b.x, a.y * b.y);
}
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
a = a * b;
return a;
}
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::max(a, b);
#else
return (a < b) ? b : a;
#endif
}
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
return std::min(a, b);
#else
return (b < a) ? b : a;
#endif
}
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
return make_int2(max(a.x, b.x), max(a.y, b.y));
}
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
return make_int2(min(a.x, b.x), min(a.y, b.y));
}
template<> struct numeric_limits<int2> {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
return _value;
}
MGPU_HOST_DEVICE T operator*() const {
return _value;
}
MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
return constant_iterator(_value);
}
MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
return *this;
}
MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
return *this;
}
private:
T _value;
};
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _value + i;
}
MGPU_HOST_DEVICE T operator*() {
return _value;
}
MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
return counting_iterator(_value + diff);
}
MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
return counting_iterator(_value - diff);
}
MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
_value += diff;
return *this;
}
MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
_value -= diff;
return *this;
}
private:
T _value;
};
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
} // namespace mgpu
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
mgpu::counting_iterator<T> it) {
return it + (-diff);
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + diff;
}
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
mgpu::step_iterator<T> it) {
return it + (-diff);
}
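// Editor's note: an illustrative usage sketch, not part of the moderngpu
// sources, for the lightweight iterators defined above. They can stand in for
// pointers in the library's host and device interfaces: constant_iterator
// repeats one value, counting_iterator yields value + i at position i, and
// step_iterator yields base + i * step. IteratorUsageSketch is hypothetical.
inline void IteratorUsageSketch() {
	mgpu::constant_iterator<int> ones(1);
	mgpu::counting_iterator<int> ids(100);
	mgpu::step_iterator<float> grid(0.0f, 0.5f);
	int a = ones[7];       // 1
	int b = ids[7];        // 107
	float c = grid[4];     // 2.0f
	(void)a; (void)b; (void)c;
}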