CUDA version of CustomManyParticleForce now uses a neighbor list

bddaf4e7 · peastman · e3b631f6 · bddaf4e7 · bddaf4e7 · bddaf4e7
Commit bddaf4e7 authored Aug 21, 2014 by peastman
3 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -964,7 +964,7 @@ private:
    CudaContext& cu;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
-    int maxNeighborPairs;
+    int maxNeighborPairs, forceWorkgroupSize;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* particleTypes;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -4475,6 +4475,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
    int numParticles = force.getNumParticles();
    int particlesPerSet = force.getNumParticlesPerSet();
    nonbondedMethod = CalcCustomManyParticleForceKernel::NonbondedMethod(force.getNonbondedMethod());
+    forceWorkgroupSize = 128;
    // Record parameter values.
@@ -4804,19 +4805,21 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
        if (i > 1)
            numCombinations<<"*";
        numCombinations<<"numNeighbors";
+        if (nonbondedMethod == NoCutoff)
            atomsForCombination<<"int p"<<(i+1)<<" = p1+1+tempIndex%numNeighbors;\n";
+        else
+            atomsForCombination<<"int p"<<(i+1)<<" = neighbors[firstNeighbor+tempIndex%numNeighbors];\n";
        atomsForCombination<<"tempIndex /= numNeighbors;\n";
    }
    if (nonbondedMethod != NoCutoff) {
-        int startCheckFrom = 0;
+        for (int i = 1; i < particlesPerSet; i++)
-        for (int i = startCheckFrom; i < particlesPerSet; i++)
            verifyCutoff<<"real4 pos"<<(i+1)<<" = posq[p"<<(i+1)<<"];\n";
-        for (int i = startCheckFrom; i < particlesPerSet; i++)
+        for (int i = 1; i < particlesPerSet; i++)
            for (int j = i+1; j < particlesPerSet; j++)
                verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);\n";
    }
    if (force.getNumExclusions() > 0) {
-        int startCheckFrom = 0;
+        int startCheckFrom = (nonbondedMethod == NoCutoff ? 0 : 1);
        for (int i = startCheckFrom; i < particlesPerSet; i++)
            for (int j = i+1; j < particlesPerSet; j++)
                verifyExclusions<<"includeInteraction &= !isInteractionExcluded(p"<<(i+1)<<", p"<<(j+1)<<", exclusions, exclusionStartIndex);\n";
@@ -4883,6 +4886,10 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
        forceArgs.push_back(&cu.getPosq().getDevicePointer());
        forceArgs.push_back(cu.getPeriodicBoxSizePointer());
        forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        if (nonbondedMethod != NoCutoff) {
+            forceArgs.push_back(&neighbors->getDevicePointer());
+            forceArgs.push_back(&neighborStartIndex->getDevicePointer());
+        }
        if (particleTypes != NULL) {
            forceArgs.push_back(&particleTypes->getDevicePointer());
            forceArgs.push_back(&orderIndex->getDevicePointer());
@@ -4967,7 +4974,7 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
            cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int));
            cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs);
        }
-        cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+        cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*forceWorkgroupSize, forceWorkgroupSize);
        if (nonbondedMethod != NoCutoff) {
            // Make sure there was enough memory for the neighbor list.

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -74,55 +74,15 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
    return false;
 }
-#define WARP_SIZE 32
-/**
- * Perform a parallel prefix sum of boolean values over an array.  This is done as the first stage of compacting an array.
- */
-__device__ void prefixSum(bool value, short* sum, ushort2* temp) {
-#if __CUDA_ARCH__ >= 300
-    const int indexInWarp = threadIdx.x%WARP_SIZE;
-    const int warpMask = (2<<indexInWarp)-1;
-    temp[threadIdx.x].x = __popc(__ballot(value)&warpMask);
-    __syncthreads();
-    if (threadIdx.x < WARP_SIZE) {
-        int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
-        for (int offset = 1; offset < blockDim.x/WARP_SIZE; offset *= 2) {
-            short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
-            if (indexInWarp >= offset)
-                multiWarpSum += n;
-        }
-        temp[threadIdx.x].y = multiWarpSum;
-    }
-    __syncthreads();
-    sum[threadIdx.x] = temp[threadIdx.x].x+(threadIdx.x < WARP_SIZE ? 0 : temp[threadIdx.x/WARP_SIZE-1].y);
-    __syncthreads();
-#else
-    temp[threadIdx.x].x = value;
-    __syncthreads();
-    int whichBuffer = 0;
-    for (int offset = 1; offset < blockDim.x; offset *= 2) {
-        if (whichBuffer == 0)
-            temp[threadIdx.x].y = (threadIdx.x < offset ? temp[threadIdx.x].x : temp[threadIdx.x].x+temp[threadIdx.x-offset].x);
-        else
-            temp[threadIdx.x].x = (threadIdx.x < offset ? temp[threadIdx.x].y : temp[threadIdx.x].y+temp[threadIdx.x-offset].y);
-        whichBuffer = 1-whichBuffer;
-        __syncthreads();
-    }
-    if (whichBuffer == 0)
-        sum[threadIdx.x] = temp[threadIdx.x].x;
-    else
-        sum[threadIdx.x] = temp[threadIdx.x].y;
-    __syncthreads();
-#endif
-}
 /**
 * Compute the interaction.
 */
 extern "C" __global__ void computeInteraction(
        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize
+#ifdef USE_CUTOFF
+        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
+#endif
 #ifdef USE_FILTERS
        , int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder
 #endif
@@ -135,7 +95,12 @@ extern "C" __global__ void computeInteraction(
    // Loop over particles to be the first one in the set.
    for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) {
+#ifdef USE_CUTOFF
+        int firstNeighbor = neighborStartIndex[p1];
+        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
+#else
        int numNeighbors = NUM_ATOMS-p1-1;
+#endif
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
        for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) {
            FIND_ATOMS_FOR_COMBINATION_INDEX;