"PyTorch/vscode:/vscode.git/clone" did not exist on "6a57493a58afc4b7b131511e6cd7217f3e6eb62e"
Commit 388bf972 authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #1632 from peastman/pairs

Fine grained pair list of nonbonded interactions
parents 064a781f 7c911459
...@@ -78,8 +78,9 @@ public: ...@@ -78,8 +78,9 @@ public:
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction * @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated * @param forceGroup the force group in which the interaction should be calculated
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/ */
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup); void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
/** /**
* Add a per-atom parameter that the default interaction kernel may depend on. * Add a per-atom parameter that the default interaction kernel may depend on.
*/ */
...@@ -189,6 +190,12 @@ public: ...@@ -189,6 +190,12 @@ public:
CudaArray& getInteractingAtoms() { CudaArray& getInteractingAtoms() {
return *interactingAtoms; return *interactingAtoms;
} }
/**
* Get the array containing single pairs in the neighbor list.
*/
CudaArray& getSinglePairs() {
return *singlePairs;
}
/** /**
* Get the array containing exclusion flags. * Get the array containing exclusion flags.
*/ */
...@@ -270,6 +277,8 @@ private: ...@@ -270,6 +277,8 @@ private:
CudaArray* interactingTiles; CudaArray* interactingTiles;
CudaArray* interactingAtoms; CudaArray* interactingAtoms;
CudaArray* interactionCount; CudaArray* interactionCount;
CudaArray* singlePairs;
CudaArray* singlePairCount;
CudaArray* blockCenter; CudaArray* blockCenter;
CudaArray* blockBoundingBox; CudaArray* blockBoundingBox;
CudaArray* sortedBlocks; CudaArray* sortedBlocks;
...@@ -288,8 +297,8 @@ private: ...@@ -288,8 +297,8 @@ private:
std::map<int, double> groupCutoff; std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList; bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags; int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
}; };
/** /**
......
...@@ -83,7 +83,7 @@ private: ...@@ -83,7 +83,7 @@ private:
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions; std::vector<double> contextNonbondedFractions;
int* tileCounts; int2* interactionCounts;
CudaArray* contextForces; CudaArray* contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
......
...@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Add the interaction to the default nonbonded kernel. // Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines); string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
if (hasLJ) if (hasLJ)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer())); cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
...@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const ...@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
if (force.getNumInteractionGroups() > 0) if (force.getNumInteractionGroups() > 0)
initInteractionGroups(force, source, tableTypes); initInteractionGroups(force, source, tableTypes);
else { else {
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory())); cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
......
...@@ -64,15 +64,16 @@ private: ...@@ -64,15 +64,16 @@ private:
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL), interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) { oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
canUsePairList(true) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice())); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0)); CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE)); CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
numForceThreadBlocks = 4*multiprocessors; numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256); forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
} }
...@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() { ...@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete interactingAtoms; delete interactingAtoms;
if (interactionCount != NULL) if (interactionCount != NULL)
delete interactionCount; delete interactionCount;
if (singlePairs != NULL)
delete singlePairs;
if (blockCenter != NULL) if (blockCenter != NULL)
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
...@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() { ...@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy(downloadCountEvent); cuEventDestroy(downloadCountEvent);
} }
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) { void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
if (groupCutoff.size() > 0) { if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff) if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff"); throw OpenMMException("All Forces must agree on whether to use a cutoff");
...@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, ...@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
usePeriodic = usesPeriodic; usePeriodic = usesPeriodic;
groupCutoff[forceGroup] = cutoffDistance; groupCutoff[forceGroup] = cutoffDistance;
groupFlags |= 1<<forceGroup; groupFlags |= 1<<forceGroup;
canUsePairList &= supportsPairList;
if (kernel.size() > 0) { if (kernel.size() > 0) {
if (groupKernelSource.find(forceGroup) == groupKernelSource.end()) if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
groupKernelSource[forceGroup] = ""; groupKernelSource[forceGroup] = "";
...@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles = numTiles; maxTiles = numTiles;
if (maxTiles < 1) if (maxTiles < 1)
maxTiles = 1; maxTiles = 1;
maxSinglePairs = 5*numAtoms;
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles"); interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms"); interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount"); interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount");
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
...@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0); vector<unsigned int> count(2, 0);
interactionCount->upload(count); interactionCount->upload(count);
} }
...@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs.push_back(&blockCenter->getDevicePointer()); forceArgs.push_back(&blockCenter->getDevicePointer());
forceArgs.push_back(&blockBoundingBox->getDevicePointer()); forceArgs.push_back(&blockBoundingBox->getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer()); forceArgs.push_back(&interactingAtoms->getDevicePointer());
forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairs->getDevicePointer());
} }
for (int i = 0; i < (int) parameters.size(); i++) for (int i = 0; i < (int) parameters.size(); i++)
forceArgs.push_back(&parameters[i].getMemory()); forceArgs.push_back(&parameters[i].getMemory());
...@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer()); findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles); findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&maxSinglePairs);
findInteractingBlocksArgs.push_back(&startBlockIndex); findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks); findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
...@@ -424,12 +434,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo ...@@ -424,12 +434,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool CudaNonbondedUtilities::updateNeighborListSize() { bool CudaNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff) if (!useCutoff)
return false; return false;
if (pinnedCountBuffer[0] <= (unsigned int) maxTiles) if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
return false; return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent // The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future. // this from happening in the future.
if (pinnedCountBuffer[0] > maxTiles) {
maxTiles = (int) (1.2*pinnedCountBuffer[0]); maxTiles = (int) (1.2*pinnedCountBuffer[0]);
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles) if (maxTiles > totalTiles)
...@@ -446,6 +457,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() { ...@@ -446,6 +457,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[17] = &interactingAtoms->getDevicePointer(); forceArgs[17] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer(); findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
}
if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
delete singlePairs;
singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
if (forceArgs.size() > 0)
forceArgs[19] = &singlePairs->getDevicePointer();
findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer();
}
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.setForcesValid(false); context.setForcesValid(false);
return true; return true;
...@@ -493,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -493,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines); CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds"); kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData"); kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
......
...@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \ ...@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask { class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
public: public:
BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel, BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel), bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) { includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
} }
void execute() { void execute() {
// Copy coordinates over to this device and execute the kernel. // Copy coordinates over to this device and execute the kernel.
...@@ -77,7 +77,7 @@ public: ...@@ -77,7 +77,7 @@ public:
} }
kernel.beginComputation(context, includeForce, includeEnergy, groups); kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic()) if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false); cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -87,15 +87,15 @@ private: ...@@ -87,15 +87,15 @@ private:
int groups; int groups;
void* pinnedMemory; void* pinnedMemory;
CUevent event; CUevent event;
int& numTiles; int2& interactionCount;
}; };
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask { class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) : bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int2& interactionCount) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) { completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
} }
void execute() { void execute() {
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
...@@ -120,7 +120,8 @@ public: ...@@ -120,7 +120,8 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
} }
} }
if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) { if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
valid = false; valid = false;
cu.getNonbondedUtilities().updateNeighborListSize(); cu.getNonbondedUtilities().updateNeighborListSize();
} }
...@@ -136,12 +137,12 @@ private: ...@@ -136,12 +137,12 @@ private:
long long* pinnedMemory; long long* pinnedMemory;
CudaArray& contextForces; CudaArray& contextForces;
bool& valid; bool& valid;
int& numTiles; int2& interactionCount;
}; };
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { interactionCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
} }
...@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() ...@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedForceBuffer); cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event); cuEventDestroy(event);
cuStreamDestroy(peerCopyStream); cuStreamDestroy(peerCopyStream);
if (tileCounts != NULL) if (interactionCounts != NULL)
cuMemFreeHost(tileCounts); cuMemFreeHost(interactionCounts);
} }
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
contextNonbondedFractions[i] = 1/(double) numContexts; contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event"); CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream"); CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer"); CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
data.contextEnergy[i] = 0.0; data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread(); CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i])); thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
} }
} }
...@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread(); CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i])); thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, interactionCounts[i]));
} }
data.syncContexts(); data.syncContexts();
double energy = 0.0; double energy = 0.0;
......
...@@ -71,9 +71,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -71,9 +71,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if (rebuild) { if (rebuild) {
rebuildNeighborList[0] = 1; rebuildNeighborList[0] = 1;
interactionCount[0] = 0; interactionCount[0] = 0;
interactionCount[1] = 0;
} }
} }
__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
// Record interactions that should be computed as single pairs rather than in blocks.
const int indexInWarp = threadIdx.x%32;
int sum = 0;
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
}
sumBuffer[indexInWarp] = sum;
for (int step = 1; step < 32; step *= 2) {
int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
sumBuffer[indexInWarp] += add;
}
int pairsToStore = sumBuffer[31];
if (indexInWarp == 0)
pairStartIndex = atomicAdd(singlePairCount, pairsToStore);
int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);
for (int i = indexInWarp; i < length; i += 32) {
int count = __popc(flags[i]);
if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count < maxSinglePairs) {
int f = flags[i];
while (f != 0) {
singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
f &= f-1;
pairIndex++;
}
}
}
// Compact the remaining interactions.
const int warpMask = (1<<indexInWarp)-1;
int numCompacted = 0;
for (int start = 0; start < length; start += 32) {
int i = start+indexInWarp;
int atom = atoms[i];
int flag = flags[i];
bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
int includeFlags = __ballot(include);
if (include) {
int index = numCompacted+__popc(includeFlags&warpMask);
atoms[index] = atom;
flags[index] = flag;
}
numCompacted += __popc(includeFlags);
}
return numCompacted;
}
/** /**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm. * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...@@ -124,8 +175,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -124,8 +175,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
* *
*/ */
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
...@@ -138,12 +190,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -138,12 +190,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32; const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
const int warpMask = (1<<indexInWarp)-1; const int warpMask = (1<<indexInWarp)-1;
__shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)]; __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)]; __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__shared__ real3 posBuffer[GROUP_SIZE]; __shared__ real3 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/32]; __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
__shared__ int sumBuffer[GROUP_SIZE];
__shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32); int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32); int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
volatile int& tileStartIndex = workgroupTileIndex[warpStart/32]; volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/32];
// Loop over blocks. // Loop over blocks.
...@@ -227,7 +284,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -227,7 +284,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
APPLY_PERIODIC_TO_DELTA(atomDelta) APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif #endif
int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w)); int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
bool interacts = false; int interacts = 0;
if (atom2 < NUM_ATOMS && atomFlags != 0) { if (atom2 < NUM_ATOMS && atomFlags != 0) {
int first = __ffs(atomFlags)-1; int first = __ffs(atomFlags)-1;
int last = 32-__clz(atomFlags); int last = 32-__clz(atomFlags);
...@@ -236,14 +293,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -236,14 +293,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for (int j = first; j < last; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
} }
} }
else { else {
#endif #endif
for (int j = first; j < last; j++) { for (int j = first; j < last; j++) {
real3 delta = pos2-posBuffer[warpStart+j]; real3 delta = pos2-posBuffer[warpStart+j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED); interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
} }
...@@ -253,15 +310,22 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -253,15 +310,22 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer. // Add any interacting atoms to the buffer.
int includeAtomFlags = __ballot(interacts); int includeAtomFlags = __ballot(interacts);
if (interacts) if (interacts) {
buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2; int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
buffer[index] = atom2;
flagsBuffer[index] = interacts;
}
neighborsInBuffer += __popc(includeAtomFlags); neighborsInBuffer += __popc(includeAtomFlags);
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) { if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory. // Store the new tiles to memory.
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
int tilesToStore = neighborsInBuffer/TILE_SIZE; int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (tilesToStore > 0) {
if (indexInWarp == 0) if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore); tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex; int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) { if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore) if (indexInWarp < tilesToStore)
...@@ -274,13 +338,18 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -274,13 +338,18 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
} }
} }
} }
}
// If we have a partially filled buffer, store it to memory. // If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if (neighborsInBuffer > 32)
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
if (neighborsInBuffer > 0) { if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE; int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0) if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore); tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex; int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) { if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore) if (indexInWarp < tilesToStore)
......
...@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded( ...@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
const int2* __restrict__ singlePairs
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
...@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded( ...@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
} }
pos++; pos++;
} }
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const unsigned int numPairs = interactionCount[1];
if (numPairs > maxSinglePairs)
return; // There wasn't enough memory for the neighbor list.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
int2 pair = singlePairs[i];
int atom1 = pair.x;
int atom2 = pair.y;
real4 posq1 = posq[atom1];
real4 posq2 = posq[atom2];
LOAD_ATOM1_PARAMETERS
int j = atom2;
atom2 = threadIdx.x;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2 = pair.y;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = r2*invR;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
bool hasExclusions = false;
bool isExcluded = false;
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3 dEdR1 = delta*dEdR;
real3 dEdR2 = -dEdR1;
#endif
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment