Commit f93e230f authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #1587 from peastman/opt

Very minor optimizations
parents 10b51d25 5ab3c394
...@@ -2002,7 +2002,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2002,7 +2002,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize()); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize(), 256);
} }
if (useCudaFFT) { if (useCudaFFT) {
...@@ -2019,13 +2019,13 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2019,13 +2019,13 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms()); cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
} }
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
...@@ -6443,7 +6443,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -6443,7 +6443,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints. // Apply constraints.
...@@ -6453,7 +6453,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -6453,7 +6453,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites(); integration.computeVirtualSites();
// Update the time and step count. // Update the time and step count.
...@@ -6526,7 +6526,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -6526,7 +6526,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints. // Apply constraints.
...@@ -6537,7 +6537,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -6537,7 +6537,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites(); integration.computeVirtualSites();
// Update the time and step count. // Update the time and step count.
...@@ -6588,7 +6588,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -6588,7 +6588,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
useDouble ? (void*) &noise : (void*) &noiseFloat, useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints. // Apply constraints.
...@@ -6599,7 +6599,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -6599,7 +6599,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat, void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites(); integration.computeVirtualSites();
// Update the time and step count. // Update the time and step count.
...@@ -6652,7 +6652,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -6652,7 +6652,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints. // Apply constraints.
...@@ -6662,7 +6662,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -6662,7 +6662,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites(); integration.computeVirtualSites();
// Update the time and step count. // Update the time and step count.
...@@ -6738,7 +6738,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -6738,7 +6738,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints. // Apply constraints.
...@@ -6749,7 +6749,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -6749,7 +6749,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites(); integration.computeVirtualSites();
// Update the time and step count. // Update the time and step count.
......
...@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) { ...@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
unsigned int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset->getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()}; void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(), void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize()); context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
// Compute the position of each bucket. // Compute the position of each bucket.
......
...@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) ...@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
*/ */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range, __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) { unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
extern __shared__ KEY_TYPE rangeBuffer[]; extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY; KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY; KEY_TYPE maximum = MIN_KEY;
...@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le ...@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
// Now reduce them. // Now reduce them.
rangeBuffer[threadIdx.x] = minimum; minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
__syncthreads(); __syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) { for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) {
rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]); minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
__syncthreads(); maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
} }
minimum = rangeBuffer[0];
__syncthreads();
rangeBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
__syncthreads(); __syncthreads();
} }
maximum = rangeBuffer[0]; minimum = minBuffer[0];
maximum = maxBuffer[0];
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
range[0] = minimum; range[0] = minimum;
range[1] = maximum; range[1] = maximum;
......
...@@ -2086,8 +2086,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2086,8 +2086,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]); pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
} }
if (includeEnergy) if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms()); cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms()); cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(*pmeGrid2, *pmeGrid, false); fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3); setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
......
...@@ -124,8 +124,9 @@ void OpenCLSort::sort(OpenCLArray& data) { ...@@ -124,8 +124,9 @@ void OpenCLSort::sort(OpenCLArray& data) {
computeRangeKernel.setArg<cl_uint>(1, data.getSize()); computeRangeKernel.setArg<cl_uint>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer()); computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL); computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg<cl_int>(4, numBuckets); computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg<cl::Buffer>(5, bucketOffset->getDeviceBuffer()); computeRangeKernel.setArg<cl_int>(5, numBuckets);
computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset->getDeviceBuffer());
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize); context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets. // Assign array elements to buckets.
......
...@@ -49,8 +49,8 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo ...@@ -49,8 +49,8 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer, __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict minBuffer,
uint numBuckets, __global uint* restrict bucketOffset) { __local KEY_TYPE* restrict maxBuffer, uint numBuckets, __global uint* restrict bucketOffset) {
KEY_TYPE minimum = MAX_KEY; KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY; KEY_TYPE maximum = MIN_KEY;
...@@ -64,23 +64,18 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, ...@@ -64,23 +64,18 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
// Now reduce them. // Now reduce them.
buffer[get_local_id(0)] = minimum; minBuffer[get_local_id(0)] = minimum;
maxBuffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) { for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) {
buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]); minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE); maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
} }
minimum = buffer[0];
barrier(CLK_LOCAL_MEM_FENCE);
buffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
maximum = buffer[0]; minimum = minBuffer[0];
maximum = maxBuffer[0];
if (get_local_id(0) == 0) { if (get_local_id(0) == 0) {
range[0] = minimum; range[0] = minimum;
range[1] = maximum; range[1] = maximum;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment