Commit f3c55c28 authored by peastman's avatar peastman
Browse files

Minor optimizations

parent 72def6fb
......@@ -92,7 +92,7 @@ private:
CudaArray offsetInBucket;
CudaArray bucketOffset;
CudaArray buckets;
CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
CUfunction shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
};
......
......@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
......@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer);
isShortList = (length <= maxLocalBuffer || length <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
......@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
CudaSort::~CudaSort() {
......@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) {
if (data.getSize() == 0)
return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory.
// We can use a simpler sort kernel that does the entire operation in one kernel.
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
}
else {
// Compute the range of data values.
......
......@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__shared__ real3 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
__shared__ int sumBuffer[GROUP_SIZE];
__shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
......
......@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
data[index] = dataBuffer[index];
}
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
__shared__ DATA_TYPE dataBuffer[64];
int globalId = blockDim.x*blockIdx.x+threadIdx.x;
DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
KEY_TYPE key = getValue(value);
int count = 0;
for (int blockStart = 0; blockStart < length; blockStart += blockDim.x) {
int numInBlock = min(blockDim.x, length-blockStart);
__syncthreads();
if (threadIdx.x < numInBlock)
dataBuffer[threadIdx.x] = dataIn[blockStart+threadIdx.x];
__syncthreads();
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]);
if (otherKey < key || (otherKey == key && blockStart+i < globalId))
count++;
}
}
if (globalId < length)
dataOut[count] = value;
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
......
......@@ -92,7 +92,7 @@ private:
OpenCLArray offsetInBucket;
OpenCLArray bucketOffset;
OpenCLArray buckets;
cl::Kernel shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
cl::Kernel shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
};
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MAX_VALUE"] = trait->getMaxValue();
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList");
shortList2Kernel = cl::Kernel(program, "sortShortList2");
computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
......@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/ || length <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
......@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
OpenCLSort::~OpenCLSort() {
......@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) {
if (data.getSize() == 0)
return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory.
// We can use a simpler sort kernel that does the entire operation in one kernel.
shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortListKernel.setArg<cl_uint>(1, dataLength);
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
if (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()) {
shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
shortList2Kernel.setArg<cl_int>(2, dataLength);
context.executeKernel(shortList2Kernel, dataLength);
buckets.copyTo(data);
}
else {
shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortListKernel.setArg<cl_uint>(1, dataLength);
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
}
}
else {
// Compute the range of data values.
......
......@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
data[index] = dataBuffer[index];
}
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__kernel void sortShortList2(__global const DATA_TYPE* restrict dataIn, __global DATA_TYPE* restrict dataOut, int length) {
__local DATA_TYPE dataBuffer[64];
DATA_TYPE value = dataIn[get_global_id(0) < length ? get_global_id(0) : 0];
KEY_TYPE key = getValue(value);
int count = 0;
for (int blockStart = 0; blockStart < length; blockStart += get_local_size(0)) {
int numInBlock = min((int) get_local_size(0), length-blockStart);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < numInBlock)
dataBuffer[get_local_id(0)] = dataIn[blockStart+get_local_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]);
if (otherKey < key || (otherKey == key && blockStart+i < get_global_id(0)))
count++;
}
}
if (get_global_id(0) < length)
dataOut[count] = value;
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment