Commit ad0783dd authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #1564 from peastman/opt

Minor optimizations
parents e93c2aba b07cf776
...@@ -205,6 +205,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -205,6 +205,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
int numThreadBlocksPerComputeUnit = (major >= 6 ? 4 : 6);
#if __CUDA_API_VERSION < 7000 #if __CUDA_API_VERSION < 7000
// This is a workaround to support GTX 980 with CUDA 6.5. It reports // This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support // its compute capability as 5.2, but the compiler doesn't support
...@@ -241,7 +242,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -241,7 +242,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int numThreadBlocksPerComputeUnit = 6;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
if (useDoublePrecision) { if (useDoublePrecision) {
posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq"); posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
......
...@@ -27,8 +27,18 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -27,8 +27,18 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
} }
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
for (int i = base+1; i < last; i++) {
pos = posq[i];
real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
}
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = center;
sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index); sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
base = index*TILE_SIZE; base = index*TILE_SIZE;
...@@ -176,12 +186,13 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -176,12 +186,13 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
int block2 = block2Base+indexInWarp; int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS); bool includeBlock2 = (block2 < NUM_BLOCKS);
if (includeBlock2) { if (includeBlock2) {
real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : make_real4(0)); real4 blockCenterY = sortedBlockCenter[block2];
real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : make_real4(0)); real4 blockSizeY = sortedBlockBoundingBox[block2];
real4 blockDelta = blockCenterX-blockCenterY; real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta) APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif #endif
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
...@@ -203,19 +214,26 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -203,19 +214,26 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
int start = y*TILE_SIZE; int atom2 = y*TILE_SIZE+indexInWarp;
int atom2 = start+indexInWarp;
real3 pos2 = trimTo3(posq[atom2]); real3 pos2 = trimTo3(posq[atom2]);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (singlePeriodicCopy) { if (singlePeriodicCopy) {
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
} }
#endif #endif
real4 blockCenterY = sortedBlockCenter[block2Base+i];
real3 atomDelta = posBuffer[warpStart+indexInWarp]-trimTo3(blockCenterY);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
bool interacts = false; bool interacts = false;
if (atom2 < NUM_ATOMS) { if (atom2 < NUM_ATOMS && atomFlags != 0) {
int first = __ffs(atomFlags)-1;
int last = 32-__clz(atomFlags);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (!singlePeriodicCopy) { if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
...@@ -223,7 +241,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -223,7 +241,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
} }
else { else {
#endif #endif
for (int j = 0; j < TILE_SIZE; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
} }
......
...@@ -27,8 +27,18 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri ...@@ -27,8 +27,18 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
maxPos = max(maxPos, pos); maxPos = max(maxPos, pos);
} }
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
for (int i = base+1; i < last; i++) {
pos = posq[i];
real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
}
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = center;
sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index); sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);
index += get_global_size(0); index += get_global_size(0);
base = index*TILE_SIZE; base = index*TILE_SIZE;
...@@ -142,6 +152,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -142,6 +152,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta) APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif #endif
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
...@@ -165,8 +176,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -165,8 +176,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
int start = y*TILE_SIZE; int atom2 = y*TILE_SIZE+indexInWarp;
int atom2 = start+indexInWarp;
real3 pos2 = posq[atom2].xyz; real3 pos2 = posq[atom2].xyz;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (singlePeriodicCopy) if (singlePeriodicCopy)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment