Commit 5fa4345f authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginning of implementing fine grained pair list

parent 10b51d25
...@@ -270,6 +270,8 @@ private: ...@@ -270,6 +270,8 @@ private:
CudaArray* interactingTiles; CudaArray* interactingTiles;
CudaArray* interactingAtoms; CudaArray* interactingAtoms;
CudaArray* interactionCount; CudaArray* interactionCount;
CudaArray* singlePairs;
CudaArray* singlePairCount;
CudaArray* blockCenter; CudaArray* blockCenter;
CudaArray* blockBoundingBox; CudaArray* blockBoundingBox;
CudaArray* sortedBlocks; CudaArray* sortedBlocks;
...@@ -289,7 +291,7 @@ private: ...@@ -289,7 +291,7 @@ private:
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList; bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags; int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
}; };
/** /**
......
...@@ -64,7 +64,7 @@ private: ...@@ -64,7 +64,7 @@ private:
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL), interactionCount(NULL), singlePairs(NULL), singlePairCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) { oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
...@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c ...@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice())); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0)); CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE)); CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
numForceThreadBlocks = 4*multiprocessors; numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256); forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
} }
...@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() { ...@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete interactingAtoms; delete interactingAtoms;
if (interactionCount != NULL) if (interactionCount != NULL)
delete interactionCount; delete interactionCount;
if (singlePairs != NULL)
delete singlePairs;
if (singlePairCount != NULL)
delete singlePairCount;
if (blockCenter != NULL) if (blockCenter != NULL)
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
...@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles = numTiles; maxTiles = numTiles;
if (maxTiles < 1) if (maxTiles < 1)
maxTiles = 1; maxTiles = 1;
maxSinglePairs = 5*numAtoms;
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles"); interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms"); interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount"); interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
singlePairCount = CudaArray::create<unsigned int>(context, 1, "singlePairCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
...@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0); vector<unsigned int> count(1, 0);
interactionCount->upload(count); interactionCount->upload(count);
singlePairCount->upload(count);
} }
// Record arguments for kernels. // Record arguments for kernels.
...@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs.push_back(&blockCenter->getDevicePointer()); forceArgs.push_back(&blockCenter->getDevicePointer());
forceArgs.push_back(&blockBoundingBox->getDevicePointer()); forceArgs.push_back(&blockBoundingBox->getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer()); forceArgs.push_back(&interactingAtoms->getDevicePointer());
forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairCount->getDevicePointer());
forceArgs.push_back(&singlePairs->getDevicePointer());
} }
for (int i = 0; i < (int) parameters.size(); i++) for (int i = 0; i < (int) parameters.size(); i++)
forceArgs.push_back(&parameters[i].getMemory()); forceArgs.push_back(&parameters[i].getMemory());
...@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer()); sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer()); sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer()); sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&singlePairCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer()); sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList); sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
...@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer()); findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles); findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&maxSinglePairs);
findInteractingBlocksArgs.push_back(&startBlockIndex); findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks); findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
...@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = false; forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance; lastCutoff = kernels.cutoffDistance;
interactionCount->download(pinnedCountBuffer, false); interactionCount->download(pinnedCountBuffer, false);
singlePairCount->download(pinnedCountBuffer+1, false);
cuEventRecord(downloadCountEvent, context.getCurrentStream()); cuEventRecord(downloadCountEvent, context.getCurrentStream());
} }
...@@ -424,28 +440,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo ...@@ -424,28 +440,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool CudaNonbondedUtilities::updateNeighborListSize() { bool CudaNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff) if (!useCutoff)
return false; return false;
if (pinnedCountBuffer[0] <= (unsigned int) maxTiles) if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
return false; return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent // The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future. // this from happening in the future.
maxTiles = (int) (1.2*pinnedCountBuffer[0]); if (pinnedCountBuffer[0] > maxTiles) {
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; maxTiles = (int) (1.2*pinnedCountBuffer[0]);
if (maxTiles > totalTiles) int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
maxTiles = totalTiles; if (maxTiles > totalTiles)
delete interactingTiles; maxTiles = totalTiles;
delete interactingAtoms; delete interactingTiles;
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails delete interactingAtoms;
interactingAtoms = NULL; interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles"); interactingAtoms = NULL;
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms"); interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
if (forceArgs.size() > 0) interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
forceArgs[7] = &interactingTiles->getDevicePointer(); if (forceArgs.size() > 0)
findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer(); forceArgs[7] = &interactingTiles->getDevicePointer();
if (forceArgs.size() > 0) findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer();
forceArgs[17] = &interactingAtoms->getDevicePointer(); if (forceArgs.size() > 0)
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer(); forceArgs[17] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
}
if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
delete singlePairs;
singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
if (forceArgs.size() > 0)
forceArgs[20] = &singlePairs->getDevicePointer();
findInteractingBlocksArgs[9] = &singlePairs->getDevicePointer();
}
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.setForcesValid(false); context.setForcesValid(false);
return true; return true;
......
...@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter, extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions, real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) { unsigned int* __restrict__ interactionCount, unsigned int* __restrict__ singlePairCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y; int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
...@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if (rebuild) { if (rebuild) {
rebuildNeighborList[0] = 1; rebuildNeighborList[0] = 1;
interactionCount[0] = 0; interactionCount[0] = 0;
singlePairCount[0] = 0;
} }
} }
__device__ int sortBlockAtoms(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
const int indexInWarp = threadIdx.x%32;
const int maxBitsForPairs = 2;
int sum = 0;
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
sum += (count <= maxBitsForPairs ? count : 0);
}
sumBuffer[indexInWarp] = sum;
for (int step = 1; step < 32; step *= 2) {
int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
sumBuffer[indexInWarp] += add;
}
int pairsToStore = sumBuffer[31];
if (indexInWarp == 0)
pairStartIndex = atomicAdd(singlePairCount, pairsToStore);
int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
if (count <= maxBitsForPairs && pairIndex+count < maxSinglePairs) {
int f = flags[i];
while (f != 0) {
singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
f &= f-1;
pairIndex++;
}
}
}
// sum = 0;
// for (int i = indexInWarp; i < length; i += 32) {
// int count = __popc(flags[i]);
// sum += (count <= maxBitsForPairs ? 1 : 0);
// }
// sumBuffer[indexInWarp] = sum;
// for (int step = 1; step < 32; step *= 2) {
// int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
// sumBuffer[indexInWarp] += add;
// }
//
//
// for (unsigned int k = 2; k < 2*length; k *= 2) {
// for (unsigned int j = k/2; j > 0; j /= 2) {
// for (unsigned int i = indexInWarp; i < length; i += 32) {
// int ixj = i^j;
// if (ixj > i && ixj < length) {
// int key1 = __popc(flags[i]);
// int key2 = __popc(flags[ixj]);
// bool ascending = ((i&k) == 0);
// for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
// ascending = ((i&mask) == 0 ? !ascending : ascending);
// int lowKey = (ascending ? key1 : key2);
// int highKey = (ascending ? key2 : key1);
// if (lowKey < highKey) {
// int tempAtom = atoms[i];
// int tempFlags = flags[i];
// atoms[i] = atoms[ixj];
// flags[i] = flags[ixj];
// atoms[ixj] = tempAtom;
// flags[ixj] = tempFlags;
// }
// }
// }
// }
// }
// return length-sumBuffer[31];
const int warpMask = (1<<indexInWarp)-1;
int numCompacted = 0;
for (int i = indexInWarp; i < BUFFER_SIZE; i += 32) {
int atom = atoms[i];
int flag = flags[i];
bool include = (i < length && __popc(flags[i]) > maxBitsForPairs);
int includeFlags = __ballot(include);
if (include) {
int index = numCompacted+__popc(includeFlags&warpMask);
atoms[index] = atom;
flags[index] = flag;
}
numCompacted += __popc(includeFlags);
}
return numCompacted;
}
/** /**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm. * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
* *
*/ */
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, unsigned int* __restrict__ singlePairCount, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles,
unsigned int maxSinglePairs, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
...@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32; const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
const int warpMask = (1<<indexInWarp)-1; const int warpMask = (1<<indexInWarp)-1;
__shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)]; __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)]; __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__shared__ real3 posBuffer[GROUP_SIZE]; __shared__ real3 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/32]; __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
__shared__ int sumBuffer[GROUP_SIZE];
__shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32); int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32); int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
volatile int& tileStartIndex = workgroupTileIndex[warpStart/32]; volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/32];
// Loop over blocks. // Loop over blocks.
...@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
APPLY_PERIODIC_TO_DELTA(atomDelta) APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif #endif
int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w)); int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
bool interacts = false; int interacts = 0;
if (atom2 < NUM_ATOMS && atomFlags != 0) { if (atom2 < NUM_ATOMS && atomFlags != 0) {
int first = __ffs(atomFlags)-1; int first = __ffs(atomFlags)-1;
int last = 32-__clz(atomFlags); int last = 32-__clz(atomFlags);
...@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for (int j = first; j < last; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
} }
} }
else { else {
#endif #endif
for (int j = first; j < last; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
} }
...@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer. // Add any interacting atoms to the buffer.
int includeAtomFlags = __ballot(interacts); int includeAtomFlags = __ballot(interacts);
if (interacts) if (interacts) {
buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2; int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
buffer[index] = atom2;
flagsBuffer[index] = interacts;
}
neighborsInBuffer += __popc(includeAtomFlags); neighborsInBuffer += __popc(includeAtomFlags);
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) { if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory. // Store the new tiles to memory.
neighborsInBuffer = sortBlockAtoms(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, singlePairCount, singlePairs, sumBuffer+warpStart, pairStartIndex);
int tilesToStore = neighborsInBuffer/TILE_SIZE; int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (indexInWarp == 0) if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore); tileStartIndex = atomicAdd(interactionCount, tilesToStore);
...@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// If we have a partially filled buffer, store it to memory. // If we have a partially filled buffer, store it to memory.
if (neighborsInBuffer > 32)
neighborsInBuffer = sortBlockAtoms(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, singlePairCount, singlePairs, sumBuffer+warpStart, pairStartIndex);
if (neighborsInBuffer > 0) { if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE; int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0) if (indexInWarp == 0)
......
...@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs, const int* __restrict__ singlePairCount,
const int2* __restrict__ singlePairs
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
...@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded( ...@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
} }
pos++; pos++;
} }
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const unsigned int numPairs = singlePairCount[0];
if (numPairs > maxSinglePairs)
return; // There wasn't enough memory for the neighbor list.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
int2 pair = singlePairs[i];
int atom1 = pair.x;
int atom2 = pair.y;
real4 posq1 = posq[atom1];
real4 posq2 = posq[atom2];
LOAD_ATOM1_PARAMETERS
int j = atom2;
atom2 = threadIdx.x;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2 = pair.y;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = r2*invR;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
bool hasExclusions = false;
bool isExcluded = false;
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3 dEdR1 = delta*dEdR;
real3 dEdR2 = -dEdR1;
#endif
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment