Unverified Commit 143fe36d authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2060 from peastman/opt

Minor optimizations
parents 38e14267 f3c55c28
...@@ -92,7 +92,7 @@ private: ...@@ -92,7 +92,7 @@ private:
CudaArray offsetInBucket; CudaArray offsetInBucket;
CudaArray bucketOffset; CudaArray bucketOffset;
CudaArray buckets; CudaArray buckets;
CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; CUfunction shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList; bool isShortList;
}; };
......
...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements)); CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList"); shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange"); computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets"); assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions"); computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
...@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int maxSharedMem; int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice()); cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2); unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer); isShortList = (length <= maxLocalBuffer || length <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = rangeKernelSize;
...@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset"); bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement"); bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket"); offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
CudaSort::~CudaSort() { CudaSort::~CudaSort() {
...@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) { ...@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) {
if (data.getSize() == 0) if (data.getSize() == 0)
return; return;
if (isShortList) { if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory. // We can use a simpler sort kernel that does the entire operation in one kernel.
void* sortArgs[] = {&data.getDevicePointer(), &dataLength}; if (dataLength <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize()); void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
} }
else { else {
// Compute the range of data values. // Compute the range of data values.
......
...@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)]; __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__shared__ real3 posBuffer[GROUP_SIZE]; __shared__ real3 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/32]; __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
__shared__ int sumBuffer[GROUP_SIZE];
__shared__ int worksgroupPairStartIndex[GROUP_SIZE/32]; __shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32); int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32); int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32); int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
......
...@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) ...@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
data[index] = dataBuffer[index]; data[index] = dataBuffer[index];
} }
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
__shared__ DATA_TYPE dataBuffer[64];
int globalId = blockDim.x*blockIdx.x+threadIdx.x;
DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
KEY_TYPE key = getValue(value);
int count = 0;
for (int blockStart = 0; blockStart < length; blockStart += blockDim.x) {
int numInBlock = min(blockDim.x, length-blockStart);
__syncthreads();
if (threadIdx.x < numInBlock)
dataBuffer[threadIdx.x] = dataIn[blockStart+threadIdx.x];
__syncthreads();
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]);
if (otherKey < key || (otherKey == key && blockStart+i < globalId))
count++;
}
}
if (globalId < length)
dataOut[count] = value;
}
/** /**
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
......
...@@ -92,7 +92,7 @@ private: ...@@ -92,7 +92,7 @@ private:
OpenCLArray offsetInBucket; OpenCLArray offsetInBucket;
OpenCLArray bucketOffset; OpenCLArray bucketOffset;
OpenCLArray buckets; OpenCLArray buckets;
cl::Kernel shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; cl::Kernel shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList; bool isShortList;
}; };
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2015 Stanford University and the Authors. * * Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList"); shortListKernel = cl::Kernel(program, "sortShortList");
shortList2Kernel = cl::Kernel(program, "sortShortList2");
computeRangeKernel = cl::Kernel(program, "computeRange"); computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets"); assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions"); computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
...@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out. // maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better. // If we officially support Qualcomm in the future, we'll need to do something better.
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/); isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/ || length <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize); positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
...@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset"); bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement"); bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket"); offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
OpenCLSort::~OpenCLSort() { OpenCLSort::~OpenCLSort() {
...@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) { ...@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) {
if (data.getSize() == 0) if (data.getSize() == 0)
return; return;
if (isShortList) { if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory. // We can use a simpler sort kernel that does the entire operation in one kernel.
shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); if (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()) {
shortListKernel.setArg<cl_uint>(1, dataLength); shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL); shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize); shortList2Kernel.setArg<cl_int>(2, dataLength);
context.executeKernel(shortList2Kernel, dataLength);
buckets.copyTo(data);
}
else {
shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortListKernel.setArg<cl_uint>(1, dataLength);
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
}
} }
else { else {
// Compute the range of data values. // Compute the range of data values.
......
...@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo ...@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
data[index] = dataBuffer[index]; data[index] = dataBuffer[index];
} }
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__kernel void sortShortList2(__global const DATA_TYPE* restrict dataIn, __global DATA_TYPE* restrict dataOut, int length) {
__local DATA_TYPE dataBuffer[64];
DATA_TYPE value = dataIn[get_global_id(0) < length ? get_global_id(0) : 0];
KEY_TYPE key = getValue(value);
int count = 0;
for (int blockStart = 0; blockStart < length; blockStart += get_local_size(0)) {
int numInBlock = min((int) get_local_size(0), length-blockStart);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < numInBlock)
dataBuffer[get_local_id(0)] = dataIn[blockStart+get_local_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]);
if (otherKey < key || (otherKey == key && blockStart+i < get_global_id(0)))
count++;
}
}
if (get_global_id(0) < length)
dataOut[count] = value;
}
/** /**
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment