"platforms/opencl/tests/TestOpenCLPythonForce.cpp" did not exist on "05472c9a812927c863be67abbb3376e944b2c7ef"
Commit e3b631f6 authored by peastman's avatar peastman
Browse files

Wrote code to build neighbor list for CUDA version of CustomManyParticleForce

parent 786e1ee7
...@@ -85,11 +85,11 @@ namespace OpenMM { ...@@ -85,11 +85,11 @@ namespace OpenMM {
* As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential. This * As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential. This
* is an interaction between three particles that depends on all three distances and angles formed by the particles. * is an interaction between three particles that depends on all three distances and angles formed by the particles.
* *
* <tt>CustomManyParticleForce* force = new CustomManyParticleForce(3, * <tt><pre>CustomManyParticleForce* force = new CustomManyParticleForce(3,
* "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;" * "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;"
* "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);" * "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);"
* "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)"); * "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)");
* </tt> * <pre></tt>
* *
* This force depends on one parameter, C. The following code defines it as a global parameter: * This force depends on one parameter, C. The following code defines it as a global parameter:
* *
......
...@@ -44,7 +44,7 @@ using namespace OpenMM; ...@@ -44,7 +44,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
CustomManyParticleForce::CustomManyParticleForce(int particlesPerSet, const string& energy) : CustomManyParticleForce::CustomManyParticleForce(int particlesPerSet, const string& energy) :
particlesPerSet(particlesPerSet), energyExpression(energy), nonbondedMethod(NoCutoff), typeFilters(particlesPerSet) { particlesPerSet(particlesPerSet), energyExpression(energy), nonbondedMethod(NoCutoff), cutoffDistance(1.0), typeFilters(particlesPerSet) {
} }
CustomManyParticleForce::~CustomManyParticleForce() { CustomManyParticleForce::~CustomManyParticleForce() {
......
...@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce ...@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce
public: public:
CudaCalcCustomManyParticleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomManyParticleForceKernel(name, platform), CudaCalcCustomManyParticleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomManyParticleForceKernel(name, platform),
hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), particleTypes(NULL), orderIndex(NULL), particleOrder(NULL), exclusions(NULL), hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), particleTypes(NULL), orderIndex(NULL), particleOrder(NULL), exclusions(NULL),
exclusionStartIndex(NULL), system(system) { exclusionStartIndex(NULL), blockCenter(NULL), blockBoundingBox(NULL), neighborPairs(NULL), numNeighborPairs(NULL), neighborStartIndex(NULL),
numNeighborsForAtom(NULL), neighbors(NULL), system(system) {
} }
~CudaCalcCustomManyParticleForceKernel(); ~CudaCalcCustomManyParticleForceKernel();
/** /**
...@@ -963,6 +964,7 @@ private: ...@@ -963,6 +964,7 @@ private:
CudaContext& cu; CudaContext& cu;
bool hasInitializedKernel; bool hasInitializedKernel;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
int maxNeighborPairs;
CudaParameterSet* params; CudaParameterSet* params;
CudaArray* globals; CudaArray* globals;
CudaArray* particleTypes; CudaArray* particleTypes;
...@@ -970,12 +972,20 @@ private: ...@@ -970,12 +972,20 @@ private:
CudaArray* particleOrder; CudaArray* particleOrder;
CudaArray* exclusions; CudaArray* exclusions;
CudaArray* exclusionStartIndex; CudaArray* exclusionStartIndex;
CudaArray* blockCenter;
CudaArray* blockBoundingBox;
CudaArray* neighborPairs;
CudaArray* numNeighborPairs;
CudaArray* neighborStartIndex;
CudaArray* numNeighborsForAtom;
CudaArray* neighbors;
std::vector<std::string> globalParamNames; std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues; std::vector<float> globalParamValues;
std::vector<CudaArray*> tabulatedFunctions; std::vector<CudaArray*> tabulatedFunctions;
std::vector<void*> forceArgs; std::vector<void*> forceArgs, blockBoundsArgs, neighborsArgs, startIndicesArgs, copyPairsArgs;
const System& system; const System& system;
CUfunction forceKernel; CUfunction forceKernel, blockBoundsKernel, neighborsKernel, startIndicesKernel, copyPairsKernel;
CUevent event;
}; };
/** /**
......
...@@ -56,6 +56,13 @@ using namespace std; ...@@ -56,6 +56,13 @@ using namespace std;
using Lepton::ExpressionTreeNode; using Lepton::ExpressionTreeNode;
using Lepton::Operation; using Lepton::Operation;
#define CHECK_RESULT(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
static bool isZeroExpression(const Lepton::ParsedExpression& expression) { static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
const Lepton::Operation& op = expression.getRootNode().getOperation(); const Lepton::Operation& op = expression.getRootNode().getOperation();
if (op.getId() != Lepton::Operation::CONSTANT) if (op.getId() != Lepton::Operation::CONSTANT)
...@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel() ...@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel()
delete exclusions; delete exclusions;
if (exclusionStartIndex != NULL) if (exclusionStartIndex != NULL)
delete exclusionStartIndex; delete exclusionStartIndex;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (neighborPairs != NULL)
delete neighborPairs;
if (numNeighborPairs != NULL)
delete numNeighborPairs;
if (neighborStartIndex != NULL)
delete neighborStartIndex;
if (neighbors != NULL)
delete neighbors;
if (numNeighborsForAtom != NULL)
delete numNeighborsForAtom;
for (int i = 0; i < (int) tabulatedFunctions.size(); i++) for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
delete tabulatedFunctions[i]; delete tabulatedFunctions[i];
} }
...@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con ...@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
exclusions->upload(exclusionsVec); exclusions->upload(exclusionsVec);
exclusionStartIndex->upload(exclusionStartIndexVec); exclusionStartIndex->upload(exclusionStartIndexVec);
} }
// Build data structures for the neighbor list.
if (nonbondedMethod != NoCutoff) {
int numAtomBlocks = cu.getNumAtomBlocks();
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockBoundingBox");
numNeighborPairs = CudaArray::create<int>(cu, 1, "customManyParticleNumNeighborPairs");
neighborStartIndex = CudaArray::create<int>(cu, numParticles+1, "customManyParticleNeighborStartIndex");
numNeighborsForAtom = CudaArray::create<int>(cu, numParticles, "customManyParticleNumNeighborsForAtom");
CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for CustomManyParticleForce");
// Select a size for the array that holds the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
maxNeighborPairs = 150*numParticles;
neighborPairs = CudaArray::create<int2>(cu, maxNeighborPairs, "customManyParticleNeighborPairs");
neighbors = CudaArray::create<int>(cu, maxNeighborPairs, "customManyParticleNeighbors");
}
// Now to generate the kernel. First, it needs to calculate all distances, angles, // Now to generate the kernel. First, it needs to calculate all distances, angles,
// and dihedrals the expression depends on. // and dihedrals the expression depends on.
...@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con ...@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["M_PI"] = cu.doubleToString(M_PI); defines["M_PI"] = cu.doubleToString(M_PI);
defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance()); defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl; defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
// std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl;
CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements), defines); CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements), defines);
forceKernel = cu.getKernel(module, "computeInteraction"); forceKernel = cu.getKernel(module, "computeInteraction");
blockBoundsKernel = cu.getKernel(module, "findBlockBounds");
neighborsKernel = cu.getKernel(module, "findNeighbors");
startIndicesKernel = cu.getKernel(module, "computeNeighborStartIndices");
copyPairsKernel = cu.getKernel(module, "copyPairsToNeighborList");
} }
double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) { double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
int index = 0;
// Set arguments for the force kernel.
forceArgs.push_back(&cu.getForce().getDevicePointer()); forceArgs.push_back(&cu.getForce().getDevicePointer());
forceArgs.push_back(&cu.getEnergyBuffer().getDevicePointer()); forceArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&cu.getPosq().getDevicePointer()); forceArgs.push_back(&cu.getPosq().getDevicePointer());
...@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool ...@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
} }
for (int i = 0; i < (int) tabulatedFunctions.size(); i++) for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
forceArgs.push_back(&tabulatedFunctions[i]->getDevicePointer()); forceArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
if (nonbondedMethod != NoCutoff) {
// Set arguments for the block bounds kernel.
blockBoundsArgs.push_back(cu.getPeriodicBoxSizePointer());
blockBoundsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
blockBoundsArgs.push_back(&cu.getPosq().getDevicePointer());
blockBoundsArgs.push_back(&blockCenter->getDevicePointer());
blockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
blockBoundsArgs.push_back(&numNeighborPairs->getDevicePointer());
// Set arguments for the neighbor list kernel.
neighborsArgs.push_back(cu.getPeriodicBoxSizePointer());
neighborsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
neighborsArgs.push_back(&cu.getPosq().getDevicePointer());
neighborsArgs.push_back(&blockCenter->getDevicePointer());
neighborsArgs.push_back(&blockBoundingBox->getDevicePointer());
neighborsArgs.push_back(&neighborPairs->getDevicePointer());
neighborsArgs.push_back(&numNeighborPairs->getDevicePointer());
neighborsArgs.push_back(&numNeighborsForAtom->getDevicePointer());
neighborsArgs.push_back(&maxNeighborPairs);
if (exclusions != NULL) {
neighborsArgs.push_back(&exclusions->getDevicePointer());
neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer());
}
// Set arguments for the kernel to find neighbor list start indices.
startIndicesArgs.push_back(&numNeighborsForAtom->getDevicePointer());
startIndicesArgs.push_back(&neighborStartIndex->getDevicePointer());
// Set arguments for the kernel to assemble the final neighbor list.
copyPairsArgs.push_back(&neighborPairs->getDevicePointer());
copyPairsArgs.push_back(&neighbors->getDevicePointer());
copyPairsArgs.push_back(&numNeighborPairs->getDevicePointer());
copyPairsArgs.push_back(&maxNeighborPairs);
copyPairsArgs.push_back(&numNeighborsForAtom->getDevicePointer());
copyPairsArgs.push_back(&neighborStartIndex->getDevicePointer());
}
} }
if (globals != NULL) { if (globals != NULL) {
bool changed = false; bool changed = false;
...@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool ...@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
if (changed) if (changed)
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); while (true) {
int* numPairs = (int*) cu.getPinnedBuffer();
if (nonbondedMethod != NoCutoff) {
cu.executeKernel(blockBoundsKernel, &blockBoundsArgs[0], cu.getNumAtomBlocks());
cu.executeKernel(neighborsKernel, &neighborsArgs[0], cu.getNumAtoms());
// We need to make sure there was enough memory for the neighbor list. Download the
// information asynchronously so kernels can be running at the same time.
numNeighborPairs->download(numPairs, false);
CHECK_RESULT(cuEventRecord(event, 0), "Error recording event for CustomManyParticleForce");
cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int));
cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs);
}
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
if (nonbondedMethod != NoCutoff) {
// Make sure there was enough memory for the neighbor list.
CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for CustomManyParticleForce");
if (*numPairs > maxNeighborPairs) {
// Resize the arrays and run the calculation again.
delete neighborPairs;
neighborPairs = NULL;
delete neighbors;
neighbors = NULL;
maxNeighborPairs = (int) (1.1*(*numPairs));
neighborPairs = CudaArray::create<int2>(cu, maxNeighborPairs, "customManyParticleNeighborPairs");
neighbors = CudaArray::create<int>(cu, maxNeighborPairs, "customManyParticleNeighbors");
continue;
}
}
break;
}
return 0.0; return 0.0;
} }
......
...@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri ...@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
return false; return false;
} }
#define WARP_SIZE 32
/**
* Perform a parallel prefix sum of boolean values over an array. This is done as the first stage of compacting an array.
*/
__device__ void prefixSum(bool value, short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
const int indexInWarp = threadIdx.x%WARP_SIZE;
const int warpMask = (2<<indexInWarp)-1;
temp[threadIdx.x].x = __popc(__ballot(value)&warpMask);
__syncthreads();
if (threadIdx.x < WARP_SIZE) {
int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
for (int offset = 1; offset < blockDim.x/WARP_SIZE; offset *= 2) {
short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
if (indexInWarp >= offset)
multiWarpSum += n;
}
temp[threadIdx.x].y = multiWarpSum;
}
__syncthreads();
sum[threadIdx.x] = temp[threadIdx.x].x+(threadIdx.x < WARP_SIZE ? 0 : temp[threadIdx.x/WARP_SIZE-1].y);
__syncthreads();
#else
temp[threadIdx.x].x = value;
__syncthreads();
int whichBuffer = 0;
for (int offset = 1; offset < blockDim.x; offset *= 2) {
if (whichBuffer == 0)
temp[threadIdx.x].y = (threadIdx.x < offset ? temp[threadIdx.x].x : temp[threadIdx.x].x+temp[threadIdx.x-offset].x);
else
temp[threadIdx.x].x = (threadIdx.x < offset ? temp[threadIdx.x].y : temp[threadIdx.x].y+temp[threadIdx.x-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
if (whichBuffer == 0)
sum[threadIdx.x] = temp[threadIdx.x].x;
else
sum[threadIdx.x] = temp[threadIdx.x].y;
__syncthreads();
#endif
}
/** /**
* Compute the interaction. * Compute the interaction.
*/ */
...@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction( ...@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction(
} }
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
\ No newline at end of file
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < NUM_ATOMS) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, NUM_ATOMS);
for (int i = base+1; i < last; i++) {
pos = posq[i];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x;
base = index*TILE_SIZE;
}
if (blockIdx.x == 0 && threadIdx.x == 0)
*numNeighborPairs = 0;
}
/**
* Find a list of neighbors for each atom.
*/
extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif
) {
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
// Load data for this atom.
real4 pos1 = posq[atom1];
int block1 = atom1/TILE_SIZE;
real4 blockCenter1 = blockCenter[block1];
real4 blockSize1 = blockBoundingBox[block1];
int totalNeighborsForAtom1 = 0;
// Loop over atom blocks to search for neighbors.
for (int block2 = block1; block2 < NUM_BLOCKS; block2++) {
real4 blockCenter2 = blockCenter[block2];
real4 blockSize2 = blockBoundingBox[block2];
real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED) {
// Loop over atoms in this block.
int start = block2*TILE_SIZE;
int end = (block2+1)*TILE_SIZE;
int included[TILE_SIZE];
int numIncluded = 0;
for (int atom2 = start; atom2 < end; atom2++) {
real4 pos2 = posq[atom2];
// Decide whether to include this atom pair in the neighbor list.
real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize);
bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#ifdef USE_EXCLUSIONS
if (includeAtom)
includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
#endif
if (includeAtom)
included[numIncluded++] = atom2;
}
// If we found any neighbors, store them to the neighbor list.
if (numIncluded > 0) {
int baseIndex = atomicAdd(numNeighborPairs, numIncluded);
if (baseIndex+numIncluded <= maxNeighborPairs)
for (int i = 0; i < numIncluded; i++)
neighborPairs[baseIndex+i] = make_int2(atom1, included[i]);
totalNeighborsForAtom1 += numIncluded;
}
}
}
numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
}
}
/**
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeighborsForAtom, int* __restrict__ neighborStartIndex) {
extern __shared__ unsigned int posBuffer[];
unsigned int globalOffset = 0;
for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += blockDim.x) {
// Load the neighbor counts into local memory.
unsigned int globalIndex = startAtom+threadIdx.x;
posBuffer[threadIdx.x] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
__syncthreads();
// Perform a parallel prefix sum.
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
__syncthreads();
posBuffer[threadIdx.x] += add;
__syncthreads();
}
// Write the results back to global memory.
if (globalIndex < NUM_ATOMS)
neighborStartIndex[globalIndex+1] = posBuffer[threadIdx.x]+globalOffset;
numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
globalOffset += posBuffer[blockDim.x-1];
}
if (threadIdx.x == 0)
neighborStartIndex[0] = 0;
}
/**
* Assemble the final neighbor list.
*/
extern "C" __global__ void copyPairsToNeighborList(const int2* __restrict__ neighborPairs, int* __restrict__ neighbors, int* __restrict__ numNeighborPairs,
int maxNeighborPairs, int* __restrict__ numNeighborsForAtom, const int* __restrict__ neighborStartIndex) {
int actualPairs = *numNeighborPairs;
if (actualPairs > maxNeighborPairs)
return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < actualPairs; index += blockDim.x*gridDim.x) {
int2 pair = neighborPairs[index];
int startIndex = neighborStartIndex[pair.x];
int offset = atomicAdd(numNeighborsForAtom+pair.x, 1);
neighbors[startIndex+offset] = pair.y;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment