Commit 7c911459 authored by Peter Eastman's avatar Peter Eastman
Browse files

Further improvements to pair list

parent 9650d66e
......@@ -78,8 +78,9 @@ public:
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......@@ -189,6 +190,12 @@ public:
CudaArray& getInteractingAtoms() {
return *interactingAtoms;
}
/**
* Get the array containing single pairs in the neighbor list.
*/
CudaArray& getSinglePairs() {
return *singlePairs;
}
/**
* Get the array containing exclusion flags.
*/
......@@ -290,7 +297,7 @@ private:
std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
};
......
......@@ -83,7 +83,7 @@ private:
std::vector<Kernel> kernels;
std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions;
int* tileCounts;
int2* interactionCounts;
CudaArray* contextForces;
void* pinnedPositionBuffer;
long long* pinnedForceBuffer;
......
......@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
if (hasLJ)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
......@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
if (force.getNumInteractionGroups() > 0)
initInteractionGroups(force, source, tableTypes);
else {
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
......
......@@ -64,8 +64,9 @@ private:
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), singlePairs(NULL), singlePairCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
canUsePairList(true) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
......@@ -94,8 +95,6 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete interactionCount;
if (singlePairs != NULL)
delete singlePairs;
if (singlePairCount != NULL)
delete singlePairCount;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
......@@ -117,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy(downloadCountEvent);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
......@@ -132,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
usePeriodic = usesPeriodic;
groupCutoff[forceGroup] = cutoffDistance;
groupFlags |= 1<<forceGroup;
canUsePairList &= supportsPairList;
if (kernel.size() > 0) {
if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
groupKernelSource[forceGroup] = "";
......@@ -286,9 +286,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxSinglePairs = 5*numAtoms;
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount");
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
singlePairCount = CudaArray::create<unsigned int>(context, 1, "singlePairCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
......@@ -298,9 +297,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0);
vector<unsigned int> count(2, 0);
interactionCount->upload(count);
singlePairCount->upload(count);
}
// Record arguments for kernels.
......@@ -325,7 +323,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs.push_back(&blockBoundingBox->getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer());
forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairCount->getDevicePointer());
forceArgs.push_back(&singlePairs->getDevicePointer());
}
for (int i = 0; i < (int) parameters.size(); i++)
......@@ -354,7 +351,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&singlePairCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
......@@ -365,7 +361,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles);
......@@ -417,7 +412,6 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
interactionCount->download(pinnedCountBuffer, false);
singlePairCount->download(pinnedCountBuffer+1, false);
cuEventRecord(downloadCountEvent, context.getCurrentStream());
}
......@@ -470,8 +464,8 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
if (forceArgs.size() > 0)
forceArgs[20] = &singlePairs->getDevicePointer();
findInteractingBlocksArgs[9] = &singlePairs->getDevicePointer();
forceArgs[19] = &singlePairs->getDevicePointer();
findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer();
}
forceRebuildNeighborList = true;
context.setForcesValid(false);
......@@ -520,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
......
......@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) {
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
......@@ -77,7 +77,7 @@ public:
}
kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false);
cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
}
private:
ContextImpl& context;
......@@ -87,15 +87,15 @@ private:
int groups;
void* pinnedMemory;
CUevent event;
int& numTiles;
int2& interactionCount;
};
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int2& interactionCount) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
}
void execute() {
// Execute the kernel, then download forces.
......@@ -120,7 +120,8 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
}
if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
......@@ -136,12 +137,12 @@ private:
long long* pinnedMemory;
CudaArray& contextForces;
bool& valid;
int& numTiles;
int2& interactionCount;
};
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
interactionCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
......@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event);
cuStreamDestroy(peerCopyStream);
if (tileCounts != NULL)
cuMemFreeHost(tileCounts);
if (interactionCounts != NULL)
cuMemFreeHost(interactionCounts);
}
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
......@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
}
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i]));
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
}
}
......@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, interactionCounts[i]));
}
data.syncContexts();
double energy = 0.0;
......
......@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, unsigned int* __restrict__ singlePairCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
......@@ -71,7 +71,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if (rebuild) {
rebuildNeighborList[0] = 1;
interactionCount[0] = 0;
singlePairCount[0] = 0;
interactionCount[1] = 0;
}
}
......@@ -79,11 +79,10 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
// Record interactions that should be computed as single pairs rather than in blocks.
const int indexInWarp = threadIdx.x%32;
const int maxBitsForPairs = 2;
int sum = 0;
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
sum += (count <= maxBitsForPairs ? count : 0);
sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
}
sumBuffer[indexInWarp] = sum;
for (int step = 1; step < 32; step *= 2) {
......@@ -96,7 +95,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
if (count <= maxBitsForPairs && pairIndex+count < maxSinglePairs) {
if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count < maxSinglePairs) {
int f = flags[i];
while (f != 0) {
singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
......@@ -114,7 +113,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
int i = start+indexInWarp;
int atom = atoms[i];
int flag = flags[i];
bool include = (i < length && __popc(flags[i]) > maxBitsForPairs);
bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
int includeFlags = __ballot(include);
if (include) {
int index = numCompacted+__popc(includeFlags&warpMask);
......@@ -177,8 +176,8 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
*/
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
unsigned int* __restrict__ singlePairCount, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles,
unsigned int maxSinglePairs, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
......@@ -320,11 +319,13 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory.
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, singlePairCount, singlePairs, sumBuffer+warpStart, pairStartIndex);
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (tilesToStore > 0) {
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
......@@ -341,12 +342,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if (neighborsInBuffer > 32)
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, singlePairCount, singlePairs, sumBuffer+warpStart, pairStartIndex);
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
......
......@@ -103,9 +103,9 @@ extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize,
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs, const int* __restrict__ singlePairCount,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
const int2* __restrict__ singlePairs
#endif
PARAMETER_ARGUMENTS) {
......@@ -593,7 +593,7 @@ extern "C" __global__ void computeNonbonded(
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const unsigned int numPairs = singlePairCount[0];
const unsigned int numPairs = interactionCount[1];
if (numPairs > maxSinglePairs)
return; // There wasn't enough memory for the neighbor list.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment