CustomNonbondedForce with interaction groups uses neighbor list in CUDA

7639d0bd · Peter Eastman · fb0cc5e7 · 7639d0bd · 7639d0bd · 7639d0bd
Commit 7639d0bd authored Mar 12, 2018 by Peter Eastman
7 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -763,15 +763,15 @@ private:
    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray globals;
-    CudaArray interactionGroupData;
-    CUfunction interactionGroupKernel;
-    std::vector<void*> interactionGroupArgs;
+    CudaArray interactionGroupData, filteredGroupData, numGroupTiles;
+    CUfunction interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
+    std::vector<void*> interactionGroupArgs, prepareNeighborListArgs, buildNeighborListArgs;
    std::vector<std::string> globalParamNames;
    std::vector<float> globalParamValues;
    std::vector<CudaArray> tabulatedFunctions;
    double longRangeCoefficient;
    std::vector<double> longRangeCoefficientDerivs;
-    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -142,6 +142,11 @@ public:
     * Get the maximum cutoff distance used by any force group.
     */
    double getMaxCutoffDistance();
+    /**
+     * Given a nonbonded cutoff, get the padded cutoff distance used in computing
+     * the neighbor list.
+     */
+    double padCutoff(double cutoff);
    /**
     * Prepare to compute interactions.  This updates the neighbor list.
     */
@@ -220,6 +225,13 @@ public:
    CudaArray& getExclusionRowIndices() {
        return exclusionRowIndices;
    }
+    /**
+     * Get the array containing a flag for whether the neighbor list was rebuilt
+     * on the most recent call to prepareInteractions().
+     */
+    CudaArray& getRebuildNeighborList() {
+        return rebuildNeighborList;
+    }
    /**
     * Get the index of the first tile this context is responsible for processing.
     */

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -2584,6 +2584,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    }
    interactionGroupData.initialize<int4>(cu, groupData.size(), "interactionGroupData");
    interactionGroupData.upload(groupData);
+    numGroupTiles.initialize<int>(cu, 1, "numGroupTiles");
+
+    // Allocate space for a neighbor list, if necessary.
+
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cu.getNumThreadBlocks()) {
+        filteredGroupData.initialize<int4>(cu, groupData.size(), "filteredGroupData");
+        interactionGroupData.copyTo(filteredGroupData);
+        int numTiles = groupData.size()/32;
+        numGroupTiles.upload(&numTiles);
+    }
    
    // Create the kernel.
    
@@ -2662,11 +2672,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
        defines["USE_CUTOFF"] = "1";
    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
        defines["USE_PERIODIC"] = "1";
-    defines["LOCAL_MEMORY_SIZE"] = cu.intToString(max(32, cu.getNonbondedUtilities().getForceThreadBlockSize()));
+    int localMemorySize = max(32, cu.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["LOCAL_MEMORY_SIZE"] = cu.intToString(localMemorySize);
+    defines["WARPS_IN_BLOCK"] = cu.intToString(localMemorySize/32);
    double cutoff = force.getCutoffDistance();
    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
+    double paddedCutoff = cu.getNonbondedUtilities().padCutoff(cutoff);
+    defines["PADDED_CUTOFF_SQUARED"] = cu.doubleToString(paddedCutoff*paddedCutoff);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["TILE_SIZE"] = "32";
+    defines["NUM_TILES"] = cu.intToString(numTileSets);
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*numTileSets/numContexts;
    int endIndex = (cu.getContextIndex()+1)*numTileSets/numContexts;
@@ -2674,12 +2689,19 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    defines["LAST_TILE"] = cu.intToString(endIndex);
    if ((localDataSize/4)%2 == 0 && !cu.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    CUmodule program = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
-    interactionGroupKernel = cu.getKernel(program, "computeInteractionGroups");
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
+    interactionGroupKernel = cu.getKernel(module, "computeInteractionGroups");
+    prepareNeighborListKernel = cu.getKernel(module, "prepareToBuildNeighborList");
+    buildNeighborListKernel = cu.getKernel(module, "buildNeighborList");
    numGroupThreadBlocks = cu.getNonbondedUtilities().getNumForceThreadBlocks();
 }

 double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    useNeighborList = (filteredGroupData.isInitialized() && cu.getNonbondedUtilities().getUseCutoff());
+    if (useNeighborList && cu.getContextIndex() > 0) {
+        // When using a neighbor list, run the whole calculation on a single device.
+        return 0.0;
+    }
    if (globals.isInitialized()) {
        bool changed = false;
        for (int i = 0; i < (int) globalParamNames.size(); i++) {
@@ -2706,7 +2728,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
            interactionGroupArgs.push_back(&cu.getForce().getDevicePointer());
            interactionGroupArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
            interactionGroupArgs.push_back(&cu.getPosq().getDevicePointer());
-            interactionGroupArgs.push_back(&interactionGroupData.getDevicePointer());
+            interactionGroupArgs.push_back(&(useNeighborList ? filteredGroupData : interactionGroupData).getDevicePointer());
+            interactionGroupArgs.push_back(&numGroupTiles.getDevicePointer());
+            interactionGroupArgs.push_back(&useNeighborList);
            interactionGroupArgs.push_back(cu.getPeriodicBoxSizePointer());
            interactionGroupArgs.push_back(cu.getInvPeriodicBoxSizePointer());
            interactionGroupArgs.push_back(cu.getPeriodicBoxVecXPointer());
@@ -2720,8 +2744,30 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
                interactionGroupArgs.push_back(&globals.getDevicePointer());
            if (hasParamDerivs)
                interactionGroupArgs.push_back(&cu.getEnergyParamDerivBuffer().getDevicePointer());
+            if (useNeighborList) {
+                // Initialize kernels for building the interaction group neighbor list.
+
+                prepareNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
+                prepareNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
+                buildNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
+                buildNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
+                buildNeighborListArgs.push_back(&cu.getPosq().getDevicePointer());
+                buildNeighborListArgs.push_back(&interactionGroupData.getDevicePointer());
+                buildNeighborListArgs.push_back(&filteredGroupData.getDevicePointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxSizePointer());
+                buildNeighborListArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecXPointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecYPointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecZPointer());
+            }
        }
        int forceThreadBlockSize = cu.getNonbondedUtilities().getForceThreadBlockSize();
+        if (useNeighborList) {
+            // Rebuild the neighbor list, if necessary.
+
+            cu.executeKernel(prepareNeighborListKernel, &prepareNeighborListArgs[0], 1, 1);
+            cu.executeKernel(buildNeighborListKernel, &buildNeighborListArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        }
        cu.executeKernel(interactionGroupKernel, &interactionGroupArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    double4 boxSize = cu.getPeriodicBoxSize();

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -351,6 +351,11 @@ double CudaNonbondedUtilities::getMaxCutoffDistance() {
    return cutoff;
 }

+double CudaNonbondedUtilities::padCutoff(double cutoff) {
+    double padding = (usePadding ? 0.1*cutoff : 0.0);
+    return cutoff+padding;
+}
+
 void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
    if ((forceGroups&groupFlags) == 0)
        return;
@@ -462,13 +467,12 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
    kernels.source = source;
    kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
    if (useCutoff) {
-        double padding = (usePadding ? 0.1*cutoff : 0.0);
-        double paddedCutoff = cutoff+padding;
+        double paddedCutoff = padCutoff(cutoff);
        map<string, string> defines;
        defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
-        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -10,6 +10,7 @@ typedef struct {

 extern "C" __global__ void computeInteractionGroups(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        int* __restrict__ numGroupTiles, bool useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -20,8 +21,8 @@ extern "C" __global__ void computeInteractionGroups(
    INIT_DERIVATIVES
    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];

-    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
-    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
+    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
+    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
@@ -86,3 +87,74 @@ extern "C" __global__ void computeInteractionGroups(
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
    SAVE_DERIVATIVES
 }
+
+/**
+ * If the neighbor list needs to be rebuilt, reset the number of tiles to 0.  This is
+ * executed by a single thread.
+ */
+extern "C" __global__  void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
+    if (rebuildNeighborList[0] == 1)
+        numGroupTiles[0] = 0;
+}
+
+/**
+ * Filter the list of tiles to include only ones that have interactions within the
+ * padded cutoff.
+ */
+extern "C" __global__  void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
+        const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+    
+    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
+    
+    if (rebuildNeighborList[0] == 0)
+        return;
+
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+
+    __shared__ real4 localPos[LOCAL_MEMORY_SIZE];
+    __shared__ volatile bool anyInteraction[WARPS_IN_BLOCK];
+    __shared__ volatile int tileIndex[WARPS_IN_BLOCK];
+
+    const unsigned int startTile = warp*NUM_TILES/totalWarps;
+    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
+    for (int tile = startTile; tile < endTile; tile++) {
+        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
+        const int atom1 = atomData.x;
+        const int atom2 = atomData.y;
+        const int rangeStart = atomData.z&0xFFFF;
+        const int rangeEnd = (atomData.z>>16)&0xFFFF;
+        const int exclusions = atomData.w;
+        real4 posq1 = posq[atom1];
+        localPos[threadIdx.x] = posq[atom2];
+        if (tgx == 0)
+            anyInteraction[local_warp] = false;
+        int tj = tgx;
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
+            if (tj < rangeEnd) {
+                bool isExcluded = (((exclusions>>tj)&1) == 0);
+                int localIndex = tbx+tj;
+                real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
+                    anyInteraction[local_warp] = true;
+            }
+            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
+            SYNC_WARPS;
+        }
+        if (anyInteraction[local_warp]) {
+            if (tgx == 0)
+                tileIndex[local_warp] = atomicAdd(numGroupTiles, 1);
+            SYNC_WARPS;
+            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
+        }
+    }
+}
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -2861,7 +2861,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, (useNeighborList ? filteredGroupData : interactionGroupData).getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, numGroupTiles.getDeviceBuffer());
-            interactionGroupKernel.setArg<cl_bool>(index++, useNeighborList);
+            interactionGroupKernel.setArg<cl_int>(index++, useNeighborList);
            index += 5;
            for (auto& buffer : params->getBuffers())
                interactionGroupKernel.setArg<cl::Memory>(index++, buffer.getMemory());

--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
@@ -43,7 +43,7 @@ __kernel void computeInteractionGroups(
        __global real4* restrict forceBuffers,
 #endif
        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
-        __global int* restrict numGroupTiles, bool useNeighborList,
+        __global int* restrict numGroupTiles, int useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
@@ -159,8 +159,8 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
    __local real4 localPos[LOCAL_MEMORY_SIZE];
-    __local bool anyInteraction[WARPS_IN_BLOCK];
-    __local int tileIndex[WARPS_IN_BLOCK];
+    __local volatile bool anyInteraction[WARPS_IN_BLOCK];
+    __local volatile int tileIndex[WARPS_IN_BLOCK];

    const unsigned int startTile = warp*NUM_TILES/totalWarps;
    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;