Wrote code to build neighbor list for CUDA version of CustomManyParticleForce

e3b631f6 · peastman · 786e1ee7 · e3b631f6 · e3b631f6 · e3b631f6
Commit e3b631f6 authored Aug 20, 2014 by peastman
5 changed files
--- a/openmmapi/include/openmm/CustomManyParticleForce.h
+++ b/openmmapi/include/openmm/CustomManyParticleForce.h
@@ -85,11 +85,11 @@ namespace OpenMM {
 * As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential.  This
 * is an interaction between three particles that depends on all three distances and angles formed by the particles.
 *
- * <tt>CustomManyParticleForce* force = new CustomManyParticleForce(3,
+ * <tt><pre>CustomManyParticleForce* force = new CustomManyParticleForce(3,
 *     "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;"
 *     "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);"
 *     "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)");
- * </tt>
+ * <pre></tt>
 *
 * This force depends on one parameter, C.  The following code defines it as a global parameter:
 *

--- a/openmmapi/src/CustomManyParticleForce.cpp
+++ b/openmmapi/src/CustomManyParticleForce.cpp
@@ -44,7 +44,7 @@ using namespace OpenMM;
 using namespace std;

 CustomManyParticleForce::CustomManyParticleForce(int particlesPerSet, const string& energy) :
-        particlesPerSet(particlesPerSet), energyExpression(energy), nonbondedMethod(NoCutoff), typeFilters(particlesPerSet) {
+        particlesPerSet(particlesPerSet), energyExpression(energy), nonbondedMethod(NoCutoff), cutoffDistance(1.0), typeFilters(particlesPerSet) {
 }

 CustomManyParticleForce::~CustomManyParticleForce() {

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce
 public:
    CudaCalcCustomManyParticleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomManyParticleForceKernel(name, platform),
            hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), particleTypes(NULL), orderIndex(NULL), particleOrder(NULL), exclusions(NULL),
-            exclusionStartIndex(NULL), system(system) {
+            exclusionStartIndex(NULL), blockCenter(NULL), blockBoundingBox(NULL), neighborPairs(NULL), numNeighborPairs(NULL), neighborStartIndex(NULL),
+            numNeighborsForAtom(NULL), neighbors(NULL), system(system) {
    }
    ~CudaCalcCustomManyParticleForceKernel();
    /**
@@ -963,6 +964,7 @@ private:
    CudaContext& cu;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
+    int maxNeighborPairs;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* particleTypes;
@@ -970,12 +972,20 @@ private:
    CudaArray* particleOrder;
    CudaArray* exclusions;
    CudaArray* exclusionStartIndex;
+    CudaArray* blockCenter;
+    CudaArray* blockBoundingBox;
+    CudaArray* neighborPairs;
+    CudaArray* numNeighborPairs;
+    CudaArray* neighborStartIndex;
+    CudaArray* numNeighborsForAtom;
+    CudaArray* neighbors;
    std::vector<std::string> globalParamNames;
    std::vector<float> globalParamValues;
    std::vector<CudaArray*> tabulatedFunctions;
-    std::vector<void*> forceArgs;
+    std::vector<void*> forceArgs, blockBoundsArgs, neighborsArgs, startIndicesArgs, copyPairsArgs;
    const System& system;
-    CUfunction forceKernel;
+    CUfunction forceKernel, blockBoundsKernel, neighborsKernel, startIndicesKernel, copyPairsKernel;
+    CUevent event;
 };

 /**

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -56,6 +56,13 @@ using namespace std;
 using Lepton::ExpressionTreeNode;
 using Lepton::Operation;

+#define CHECK_RESULT(result, prefix) \
+    if (result != CUDA_SUCCESS) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
 static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
    const Lepton::Operation& op = expression.getRootNode().getOperation();
    if (op.getId() != Lepton::Operation::CONSTANT)
@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel()
        delete exclusions;
    if (exclusionStartIndex != NULL)
        delete exclusionStartIndex;
+    if (blockCenter != NULL)
+        delete blockCenter;
+    if (blockBoundingBox != NULL)
+        delete blockBoundingBox;
+    if (neighborPairs != NULL)
+        delete neighborPairs;
+    if (numNeighborPairs != NULL)
+        delete numNeighborPairs;
+    if (neighborStartIndex != NULL)
+        delete neighborStartIndex;
+    if (neighbors != NULL)
+        delete neighbors;
+    if (numNeighborsForAtom != NULL)
+        delete numNeighborsForAtom;
    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
        delete tabulatedFunctions[i];
 }
@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
        exclusions->upload(exclusionsVec);
        exclusionStartIndex->upload(exclusionStartIndexVec);
    }
+    
+    // Build data structures for the neighbor list.
+    
+    if (nonbondedMethod != NoCutoff) {
+        int numAtomBlocks = cu.getNumAtomBlocks();
+        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+        blockCenter = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        numNeighborPairs = CudaArray::create<int>(cu, 1, "customManyParticleNumNeighborPairs");
+        neighborStartIndex = CudaArray::create<int>(cu, numParticles+1, "customManyParticleNeighborStartIndex");
+        numNeighborsForAtom = CudaArray::create<int>(cu, numParticles, "customManyParticleNumNeighborsForAtom");
+        CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for CustomManyParticleForce");
+
+        // Select a size for the array that holds the neighbor list.  We have to make a fairly
+        // arbitrary guess, but if this turns out to be too small we'll increase it later.
+
+        maxNeighborPairs = 150*numParticles;
+        neighborPairs = CudaArray::create<int2>(cu, maxNeighborPairs, "customManyParticleNeighborPairs");
+        neighbors = CudaArray::create<int>(cu, maxNeighborPairs, "customManyParticleNeighbors");
+    }

    // Now to generate the kernel.  First, it needs to calculate all distances, angles,
    // and dihedrals the expression depends on.
@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["M_PI"] = cu.doubleToString(M_PI);
    defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-    std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl;
+    defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
+    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+//    std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl;
    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements), defines);
    forceKernel = cu.getKernel(module, "computeInteraction");
+    blockBoundsKernel = cu.getKernel(module, "findBlockBounds");
+    neighborsKernel = cu.getKernel(module, "findNeighbors");
+    startIndicesKernel = cu.getKernel(module, "computeNeighborStartIndices");
+    copyPairsKernel = cu.getKernel(module, "copyPairsToNeighborList");
 }

 double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
-        int index = 0;
+        
+        // Set arguments for the force kernel.
+        
        forceArgs.push_back(&cu.getForce().getDevicePointer());
        forceArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
        forceArgs.push_back(&cu.getPosq().getDevicePointer());
@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
        }
        for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
            forceArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
+        
+        if (nonbondedMethod != NoCutoff) {
+            // Set arguments for the block bounds kernel.
+
+            blockBoundsArgs.push_back(cu.getPeriodicBoxSizePointer());
+            blockBoundsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+            blockBoundsArgs.push_back(&cu.getPosq().getDevicePointer());
+            blockBoundsArgs.push_back(&blockCenter->getDevicePointer());
+            blockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
+            blockBoundsArgs.push_back(&numNeighborPairs->getDevicePointer());
+
+            // Set arguments for the neighbor list kernel.
+
+            neighborsArgs.push_back(cu.getPeriodicBoxSizePointer());
+            neighborsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+            neighborsArgs.push_back(&cu.getPosq().getDevicePointer());
+            neighborsArgs.push_back(&blockCenter->getDevicePointer());
+            neighborsArgs.push_back(&blockBoundingBox->getDevicePointer());
+            neighborsArgs.push_back(&neighborPairs->getDevicePointer());
+            neighborsArgs.push_back(&numNeighborPairs->getDevicePointer());
+            neighborsArgs.push_back(&numNeighborsForAtom->getDevicePointer());
+            neighborsArgs.push_back(&maxNeighborPairs);
+            if (exclusions != NULL) {
+                neighborsArgs.push_back(&exclusions->getDevicePointer());
+                neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer());
+            }
+            
+            // Set arguments for the kernel to find neighbor list start indices.
+            
+            startIndicesArgs.push_back(&numNeighborsForAtom->getDevicePointer());
+            startIndicesArgs.push_back(&neighborStartIndex->getDevicePointer());
+
+            // Set arguments for the kernel to assemble the final neighbor list.
+            
+            copyPairsArgs.push_back(&neighborPairs->getDevicePointer());
+            copyPairsArgs.push_back(&neighbors->getDevicePointer());
+            copyPairsArgs.push_back(&numNeighborPairs->getDevicePointer());
+            copyPairsArgs.push_back(&maxNeighborPairs);
+            copyPairsArgs.push_back(&numNeighborsForAtom->getDevicePointer());
+            copyPairsArgs.push_back(&neighborStartIndex->getDevicePointer());
+       }
    }
    if (globals != NULL) {
        bool changed = false;
@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
        if (changed)
            globals->upload(globalParamValues);
    }
-    cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+    while (true) {
+        int* numPairs = (int*) cu.getPinnedBuffer();
+        if (nonbondedMethod != NoCutoff) {
+            cu.executeKernel(blockBoundsKernel, &blockBoundsArgs[0], cu.getNumAtomBlocks());
+            cu.executeKernel(neighborsKernel, &neighborsArgs[0], cu.getNumAtoms());
+
+            // We need to make sure there was enough memory for the neighbor list.  Download the
+            // information asynchronously so kernels can be running at the same time.
+
+            numNeighborPairs->download(numPairs, false);
+            CHECK_RESULT(cuEventRecord(event, 0), "Error recording event for CustomManyParticleForce");
+            cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int));
+            cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs);
+        }
+        cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+        if (nonbondedMethod != NoCutoff) {
+            // Make sure there was enough memory for the neighbor list.
+
+            CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for CustomManyParticleForce");
+            if (*numPairs > maxNeighborPairs) {
+                // Resize the arrays and run the calculation again.
+
+                delete neighborPairs;
+                neighborPairs = NULL;
+                delete neighbors;
+                neighbors = NULL;
+                maxNeighborPairs = (int) (1.1*(*numPairs));
+                neighborPairs = CudaArray::create<int2>(cu, maxNeighborPairs, "customManyParticleNeighborPairs");
+                neighbors = CudaArray::create<int>(cu, maxNeighborPairs, "customManyParticleNeighbors");
+                continue;
+            }
+        }
+        break;
+    }
    return 0.0;
 }


--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
    return false;
 }

+#define WARP_SIZE 32
+
+/**
+ * Perform a parallel prefix sum of boolean values over an array.  This is done as the first stage of compacting an array.
+ */
+__device__ void prefixSum(bool value, short* sum, ushort2* temp) {
+#if __CUDA_ARCH__ >= 300
+    const int indexInWarp = threadIdx.x%WARP_SIZE;
+    const int warpMask = (2<<indexInWarp)-1;
+    temp[threadIdx.x].x = __popc(__ballot(value)&warpMask);
+    __syncthreads();
+    if (threadIdx.x < WARP_SIZE) {
+        int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
+        for (int offset = 1; offset < blockDim.x/WARP_SIZE; offset *= 2) {
+            short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
+            if (indexInWarp >= offset)
+                multiWarpSum += n;
+        }
+        temp[threadIdx.x].y = multiWarpSum;
+    }
+    __syncthreads();
+    sum[threadIdx.x] = temp[threadIdx.x].x+(threadIdx.x < WARP_SIZE ? 0 : temp[threadIdx.x/WARP_SIZE-1].y);
+    __syncthreads();
+#else
+    temp[threadIdx.x].x = value;
+    __syncthreads();
+    int whichBuffer = 0;
+    for (int offset = 1; offset < blockDim.x; offset *= 2) {
+        if (whichBuffer == 0)
+            temp[threadIdx.x].y = (threadIdx.x < offset ? temp[threadIdx.x].x : temp[threadIdx.x].x+temp[threadIdx.x-offset].x);
+        else
+            temp[threadIdx.x].x = (threadIdx.x < offset ? temp[threadIdx.x].y : temp[threadIdx.x].y+temp[threadIdx.x-offset].y);
+        whichBuffer = 1-whichBuffer;
+        __syncthreads();
+    }
+    if (whichBuffer == 0)
+        sum[threadIdx.x] = temp[threadIdx.x].x;
+    else
+        sum[threadIdx.x] = temp[threadIdx.x].y;
+    __syncthreads();
+#endif
+}
+
 /**
 * Compute the interaction.
 */
@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction(
        }
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
\ No newline at end of file
+}
+
+/**
+ * Find a bounding box for the atoms in each block.
+ */
+extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
+        real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int base = index*TILE_SIZE;
+    while (base < NUM_ATOMS) {
+        real4 pos = posq[base];
+#ifdef USE_PERIODIC
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+#endif
+        real4 minPos = pos;
+        real4 maxPos = pos;
+        int last = min(base+TILE_SIZE, NUM_ATOMS);
+        for (int i = base+1; i < last; i++) {
+            pos = posq[i];
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(maxPos+minPos);
+            pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
+            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
+        }
+        real4 blockSize = 0.5f*(maxPos-minPos);
+        blockBoundingBox[index] = blockSize;
+        blockCenter[index] = 0.5f*(maxPos+minPos);
+        index += blockDim.x*gridDim.x;
+        base = index*TILE_SIZE;
+    }
+    if (blockIdx.x == 0 && threadIdx.x == 0)
+        *numNeighborPairs = 0;
+}
+
+/**
+ * Find a list of neighbors for each atom.
+ */
+extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
+        const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
+        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
+#ifdef USE_EXCLUSIONS
+        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
+#endif
+        ) {
+    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
+        // Load data for this atom.
+        
+        real4 pos1 = posq[atom1];
+        int block1 = atom1/TILE_SIZE;
+        real4 blockCenter1 = blockCenter[block1];
+        real4 blockSize1 = blockBoundingBox[block1];
+        int totalNeighborsForAtom1 = 0;
+        
+        // Loop over atom blocks to search for neighbors.
+        
+        for (int block2 = block1; block2 < NUM_BLOCKS; block2++) {
+            real4 blockCenter2 = blockCenter[block2];
+            real4 blockSize2 = blockBoundingBox[block2];
+            real4 blockDelta = blockCenter1-blockCenter2;
+#ifdef USE_PERIODIC
+            blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
+            blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
+            blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
+            if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED) {
+                // Loop over atoms in this block.
+                
+                int start = block2*TILE_SIZE;
+                int end = (block2+1)*TILE_SIZE;
+                int included[TILE_SIZE];
+                int numIncluded = 0;
+                for (int atom2 = start; atom2 < end; atom2++) {
+                    real4 pos2 = posq[atom2];
+                    
+                    // Decide whether to include this atom pair in the neighbor list.
+                    
+                    real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize);
+                    bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
+#ifdef USE_EXCLUSIONS
+                    if (includeAtom)
+                        includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
+#endif
+                    if (includeAtom)
+                        included[numIncluded++] = atom2;
+                }
+                
+                // If we found any neighbors, store them to the neighbor list.
+                
+                if (numIncluded > 0) {
+                    int baseIndex = atomicAdd(numNeighborPairs, numIncluded);
+                    if (baseIndex+numIncluded <= maxNeighborPairs)
+                        for (int i = 0; i < numIncluded; i++)
+                            neighborPairs[baseIndex+i] = make_int2(atom1, included[i]);
+                    totalNeighborsForAtom1 += numIncluded;
+                }
+            }
+        }
+        numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
+    }
+}
+
+/**
+ * Sum the neighbor counts to compute the start position of each atom.  This kernel
+ * is executed as a single work group.
+ */
+extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeighborsForAtom, int* __restrict__ neighborStartIndex) {
+    extern __shared__ unsigned int posBuffer[];
+    unsigned int globalOffset = 0;
+    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += blockDim.x) {
+        // Load the neighbor counts into local memory.
+
+        unsigned int globalIndex = startAtom+threadIdx.x;
+        posBuffer[threadIdx.x] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
+        __syncthreads();
+
+        // Perform a parallel prefix sum.
+
+        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
+            __syncthreads();
+            posBuffer[threadIdx.x] += add;
+            __syncthreads();
+        }
+
+        // Write the results back to global memory.
+
+        if (globalIndex < NUM_ATOMS)
+            neighborStartIndex[globalIndex+1] = posBuffer[threadIdx.x]+globalOffset;
+        numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
+        globalOffset += posBuffer[blockDim.x-1];
+    }
+    if (threadIdx.x == 0)
+        neighborStartIndex[0] = 0;
+}
+
+/**
+ * Assemble the final neighbor list.
+ */
+extern "C" __global__ void copyPairsToNeighborList(const int2* __restrict__ neighborPairs, int* __restrict__ neighbors, int* __restrict__ numNeighborPairs,
+            int maxNeighborPairs, int* __restrict__ numNeighborsForAtom, const int* __restrict__ neighborStartIndex) {
+    int actualPairs = *numNeighborPairs;
+    if (actualPairs > maxNeighborPairs)
+        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < actualPairs; index += blockDim.x*gridDim.x) {
+        int2 pair = neighborPairs[index];
+        int startIndex = neighborStartIndex[pair.x];
+        int offset = atomicAdd(numNeighborsForAtom+pair.x, 1);
+        neighbors[startIndex+offset] = pair.y;
+    }
+}