Commit 5ab3c394 authored by Peter Eastman's avatar Peter Eastman
Browse files

Minor optimizations

parent 9e34d39e
......@@ -2002,7 +2002,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize(), 256);
}
if (useCudaFFT) {
......@@ -2019,13 +2019,13 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
......@@ -6443,7 +6443,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints.
......@@ -6453,7 +6453,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites();
// Update the time and step count.
......@@ -6526,7 +6526,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints.
......@@ -6537,7 +6537,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites();
// Update the time and step count.
......@@ -6588,7 +6588,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints.
......@@ -6599,7 +6599,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites();
// Update the time and step count.
......@@ -6652,7 +6652,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints.
......@@ -6662,7 +6662,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites();
// Update the time and step count.
......@@ -6738,7 +6738,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms, 128);
// Apply constraints.
......@@ -6749,7 +6749,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms, 128);
integration.computeVirtualSites();
// Update the time and step count.
......
......@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
unsigned int numBuckets = bucketOffset->getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
// Compute the position of each bucket.
......
......@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
extern __shared__ KEY_TYPE rangeBuffer[];
extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
......@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
// Now reduce them.
rangeBuffer[threadIdx.x] = minimum;
minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
__syncthreads();
}
minimum = rangeBuffer[0];
__syncthreads();
rangeBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
}
__syncthreads();
}
maximum = rangeBuffer[0];
minimum = minBuffer[0];
maximum = maxBuffer[0];
if (threadIdx.x == 0) {
range[0] = minimum;
range[1] = maximum;
......
......@@ -2086,8 +2086,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
......
......@@ -124,8 +124,9 @@ void OpenCLSort::sort(OpenCLArray& data) {
computeRangeKernel.setArg<cl_uint>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg<cl_int>(4, numBuckets);
computeRangeKernel.setArg<cl::Buffer>(5, bucketOffset->getDeviceBuffer());
computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg<cl_int>(5, numBuckets);
computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset->getDeviceBuffer());
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets.
......
......@@ -49,8 +49,8 @@ __kernel void sortShortList(__global DATA_TYPE* __restrict__ data, uint length,
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer,
uint numBuckets, __global uint* restrict bucketOffset) {
__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict minBuffer,
__local KEY_TYPE* restrict maxBuffer, uint numBuckets, __global uint* restrict bucketOffset) {
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
......@@ -64,23 +64,18 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
// Now reduce them.
buffer[get_local_id(0)] = minimum;
minBuffer[get_local_id(0)] = minimum;
maxBuffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE);
}
minimum = buffer[0];
barrier(CLK_LOCAL_MEM_FENCE);
buffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) {
minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
maximum = buffer[0];
minimum = minBuffer[0];
maximum = maxBuffer[0];
if (get_local_id(0) == 0) {
range[0] = minimum;
range[1] = maximum;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment