Commit 7639d0bd authored by Peter Eastman's avatar Peter Eastman
Browse files

CustomNonbondedForce with interaction groups uses neighbor list in CUDA

parent fb0cc5e7
......@@ -763,15 +763,15 @@ private:
ForceInfo* info;
CudaParameterSet* params;
CudaArray globals;
CudaArray interactionGroupData;
CUfunction interactionGroupKernel;
std::vector<void*> interactionGroupArgs;
CudaArray interactionGroupData, filteredGroupData, numGroupTiles;
CUfunction interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
std::vector<void*> interactionGroupArgs, prepareNeighborListArgs, buildNeighborListArgs;
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
std::vector<CudaArray> tabulatedFunctions;
double longRangeCoefficient;
std::vector<double> longRangeCoefficientDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy;
const System& system;
......
......@@ -142,6 +142,11 @@ public:
* Get the maximum cutoff distance used by any force group.
*/
double getMaxCutoffDistance();
/**
* Given a nonbonded cutoff, get the padded cutoff distance used in computing
* the neighbor list.
*/
double padCutoff(double cutoff);
/**
* Prepare to compute interactions. This updates the neighbor list.
*/
......@@ -220,6 +225,13 @@ public:
CudaArray& getExclusionRowIndices() {
return exclusionRowIndices;
}
/**
* Get the array containing a flag for whether the neighbor list was rebuilt
* on the most recent call to prepareInteractions().
*/
CudaArray& getRebuildNeighborList() {
return rebuildNeighborList;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
......
......@@ -2584,6 +2584,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
}
interactionGroupData.initialize<int4>(cu, groupData.size(), "interactionGroupData");
interactionGroupData.upload(groupData);
numGroupTiles.initialize<int>(cu, 1, "numGroupTiles");
// Allocate space for a neighbor list, if necessary.
if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cu.getNumThreadBlocks()) {
filteredGroupData.initialize<int4>(cu, groupData.size(), "filteredGroupData");
interactionGroupData.copyTo(filteredGroupData);
int numTiles = groupData.size()/32;
numGroupTiles.upload(&numTiles);
}
// Create the kernel.
......@@ -2662,11 +2672,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
defines["USE_CUTOFF"] = "1";
if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
defines["USE_PERIODIC"] = "1";
defines["LOCAL_MEMORY_SIZE"] = cu.intToString(max(32, cu.getNonbondedUtilities().getForceThreadBlockSize()));
int localMemorySize = max(32, cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["LOCAL_MEMORY_SIZE"] = cu.intToString(localMemorySize);
defines["WARPS_IN_BLOCK"] = cu.intToString(localMemorySize/32);
double cutoff = force.getCutoffDistance();
defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
double paddedCutoff = cu.getNonbondedUtilities().padCutoff(cutoff);
defines["PADDED_CUTOFF_SQUARED"] = cu.doubleToString(paddedCutoff*paddedCutoff);
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["TILE_SIZE"] = "32";
defines["NUM_TILES"] = cu.intToString(numTileSets);
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*numTileSets/numContexts;
int endIndex = (cu.getContextIndex()+1)*numTileSets/numContexts;
......@@ -2674,12 +2689,19 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
defines["LAST_TILE"] = cu.intToString(endIndex);
if ((localDataSize/4)%2 == 0 && !cu.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
CUmodule program = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cu.getKernel(program, "computeInteractionGroups");
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cu.getKernel(module, "computeInteractionGroups");
prepareNeighborListKernel = cu.getKernel(module, "prepareToBuildNeighborList");
buildNeighborListKernel = cu.getKernel(module, "buildNeighborList");
numGroupThreadBlocks = cu.getNonbondedUtilities().getNumForceThreadBlocks();
}
double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
useNeighborList = (filteredGroupData.isInitialized() && cu.getNonbondedUtilities().getUseCutoff());
if (useNeighborList && cu.getContextIndex() > 0) {
// When using a neighbor list, run the whole calculation on a single device.
return 0.0;
}
if (globals.isInitialized()) {
bool changed = false;
for (int i = 0; i < (int) globalParamNames.size(); i++) {
......@@ -2706,7 +2728,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs.push_back(&cu.getForce().getDevicePointer());
interactionGroupArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
interactionGroupArgs.push_back(&cu.getPosq().getDevicePointer());
interactionGroupArgs.push_back(&interactionGroupData.getDevicePointer());
interactionGroupArgs.push_back(&(useNeighborList ? filteredGroupData : interactionGroupData).getDevicePointer());
interactionGroupArgs.push_back(&numGroupTiles.getDevicePointer());
interactionGroupArgs.push_back(&useNeighborList);
interactionGroupArgs.push_back(cu.getPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getInvPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxVecXPointer());
......@@ -2720,8 +2744,30 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs.push_back(&globals.getDevicePointer());
if (hasParamDerivs)
interactionGroupArgs.push_back(&cu.getEnergyParamDerivBuffer().getDevicePointer());
if (useNeighborList) {
// Initialize kernels for building the interaction group neighbor list.
prepareNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
prepareNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
buildNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
buildNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
buildNeighborListArgs.push_back(&cu.getPosq().getDevicePointer());
buildNeighborListArgs.push_back(&interactionGroupData.getDevicePointer());
buildNeighborListArgs.push_back(&filteredGroupData.getDevicePointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxSizePointer());
buildNeighborListArgs.push_back(cu.getInvPeriodicBoxSizePointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecXPointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecYPointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecZPointer());
}
}
int forceThreadBlockSize = cu.getNonbondedUtilities().getForceThreadBlockSize();
if (useNeighborList) {
// Rebuild the neighbor list, if necessary.
cu.executeKernel(prepareNeighborListKernel, &prepareNeighborListArgs[0], 1, 1);
cu.executeKernel(buildNeighborListKernel, &buildNeighborListArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
cu.executeKernel(interactionGroupKernel, &interactionGroupArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
double4 boxSize = cu.getPeriodicBoxSize();
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -351,6 +351,11 @@ double CudaNonbondedUtilities::getMaxCutoffDistance() {
return cutoff;
}
double CudaNonbondedUtilities::padCutoff(double cutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
return cutoff+padding;
}
void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
if ((forceGroups&groupFlags) == 0)
return;
......@@ -462,13 +467,12 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.source = source;
kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
double paddedCutoff = padCutoff(cutoff);
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
......
......@@ -10,6 +10,7 @@ typedef struct {
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
int* __restrict__ numGroupTiles, bool useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
......@@ -20,8 +21,8 @@ extern "C" __global__ void computeInteractionGroups(
INIT_DERIVATIVES
__shared__ AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
......@@ -86,3 +87,74 @@ extern "C" __global__ void computeInteractionGroups(
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_DERIVATIVES
}
/**
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
extern "C" __global__ void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
if (rebuildNeighborList[0] == 1)
numGroupTiles[0] = 0;
}
/**
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
if (rebuildNeighborList[0] == 0)
return;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
__shared__ real4 localPos[LOCAL_MEMORY_SIZE];
__shared__ volatile bool anyInteraction[WARPS_IN_BLOCK];
__shared__ volatile int tileIndex[WARPS_IN_BLOCK];
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
localPos[threadIdx.x] = posq[atom2];
if (tgx == 0)
anyInteraction[local_warp] = false;
int tj = tgx;
SYNC_WARPS;
for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
if (tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
anyInteraction[local_warp] = true;
}
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
if (anyInteraction[local_warp]) {
if (tgx == 0)
tileIndex[local_warp] = atomicAdd(numGroupTiles, 1);
SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
}
}
}
......@@ -2861,7 +2861,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, (useNeighborList ? filteredGroupData : interactionGroupData).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, numGroupTiles.getDeviceBuffer());
interactionGroupKernel.setArg<cl_bool>(index++, useNeighborList);
interactionGroupKernel.setArg<cl_int>(index++, useNeighborList);
index += 5;
for (auto& buffer : params->getBuffers())
interactionGroupKernel.setArg<cl::Memory>(index++, buffer.getMemory());
......
......@@ -43,7 +43,7 @@ __kernel void computeInteractionGroups(
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
__global int* restrict numGroupTiles, bool useNeighborList,
__global int* restrict numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
......@@ -159,8 +159,8 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex
__local real4 localPos[LOCAL_MEMORY_SIZE];
__local bool anyInteraction[WARPS_IN_BLOCK];
__local int tileIndex[WARPS_IN_BLOCK];
__local volatile bool anyInteraction[WARPS_IN_BLOCK];
__local volatile int tileIndex[WARPS_IN_BLOCK];
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment