Unverified Commit c89cbcdb authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2014 from peastman/groups

CustomNonbondedForce with interaction groups uses neighbor lists
parents c6df2891 2df35b4e
......@@ -252,7 +252,9 @@ void CpuCustomNonbondedForce::calculateOneIxn(int ii, int jj, ThreadData& data,
// accumulate forces
double dEdR = (includeForce ? data.forceExpression.evaluate()/r : 0.0);
double energy = (includeEnergy ? data.energyExpression.evaluate() : 0.0);
double energy = 0.0;
if (includeEnergy || (useSwitch && r > switchingDistance))
energy = data.energyExpression.evaluate();
double switchValue = 1.0;
if (useSwitch) {
if (r > switchingDistance) {
......
......@@ -763,15 +763,15 @@ private:
ForceInfo* info;
CudaParameterSet* params;
CudaArray globals;
CudaArray interactionGroupData;
CUfunction interactionGroupKernel;
std::vector<void*> interactionGroupArgs;
CudaArray interactionGroupData, filteredGroupData, numGroupTiles;
CUfunction interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
std::vector<void*> interactionGroupArgs, prepareNeighborListArgs, buildNeighborListArgs;
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
std::vector<CudaArray> tabulatedFunctions;
double longRangeCoefficient;
std::vector<double> longRangeCoefficientDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy;
const System& system;
......
......@@ -142,6 +142,11 @@ public:
* Get the maximum cutoff distance used by any force group.
*/
double getMaxCutoffDistance();
/**
* Given a nonbonded cutoff, get the padded cutoff distance used in computing
* the neighbor list.
*/
double padCutoff(double cutoff);
/**
* Prepare to compute interactions. This updates the neighbor list.
*/
......@@ -220,6 +225,13 @@ public:
CudaArray& getExclusionRowIndices() {
return exclusionRowIndices;
}
/**
* Get the array containing a flag for whether the neighbor list was rebuilt
* on the most recent call to prepareInteractions().
*/
CudaArray& getRebuildNeighborList() {
return rebuildNeighborList;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
......
......@@ -54,6 +54,7 @@
#include "jama_eig.h"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <set>
using namespace OpenMM;
......@@ -2429,7 +2430,8 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
vector<vector<int> > atomLists;
vector<pair<int, int> > tiles;
map<pair<int, int>, int> duplicateInteractions;
vector<int> tileGroup;
vector<vector<int> > duplicateAtomsForGroup;
for (int group = 0; group < force.getNumInteractionGroups(); group++) {
// Get the list of atoms in this group and sort them.
......@@ -2440,6 +2442,10 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
sort(atoms1.begin(), atoms1.end());
sort(atoms2.begin(), atoms2.end());
duplicateAtomsForGroup.push_back(vector<int>());
set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(),
inserter(duplicateAtomsForGroup[group], duplicateAtomsForGroup[group].begin()));
sort(duplicateAtomsForGroup[group].begin(), duplicateAtomsForGroup[group].end());
// Find how many tiles we will create for this group.
......@@ -2451,9 +2457,12 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
// Add the tiles.
int firstTile = tiles.size();
for (int i = 0; i < numBlocks1; i++)
for (int j = 0; j < numBlocks2; j++)
for (int j = 0; j < numBlocks2; j++) {
tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
tileGroup.push_back(group);
}
// Add the atom lists.
......@@ -2473,22 +2482,6 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
atoms.push_back(atoms2[j]);
atomLists.push_back(atoms);
}
// If this group contains duplicate interactions, record that we need to skip them once.
for (int a1 : atoms1) {
if (set2.find(a1) == set2.end())
continue;
for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
int a2 = atoms2[j];
if (set1.find(a2) != set1.end()) {
pair<int, int> key = make_pair(a2, a1);
if (duplicateInteractions.find(key) == duplicateInteractions.end())
duplicateInteractions[key] = 0;
duplicateInteractions[key]++;
}
}
}
}
// Build a lookup table for quickly identifying excluded interactions.
......@@ -2506,15 +2499,18 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
vector<vector<int> > exclusionFlags(tiles.size());
vector<pair<int, int> > tileOrder;
for (int tile = 0; tile < tiles.size(); tile++) {
bool swapped = false;
if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
// For efficiency, we want the first axis to be the larger one.
int swap = tiles[tile].first;
tiles[tile].first = tiles[tile].second;
tiles[tile].second = swap;
swapped = true;
}
vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second];
vector<int>& duplicateAtoms = duplicateAtomsForGroup[tileGroup[tile]];
vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
int numExcluded = 0;
for (int i = 0; i < (int) atoms1.size(); i++)
......@@ -2525,11 +2521,10 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
if (a1 == a2 || exclusions.find(key) != exclusions.end())
isExcluded = true; // This is an excluded interaction.
else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
else if ((a1 > a2) == swapped && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a1) && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a2)) {
// Both atoms are in both sets, so skip duplicate interactions.
isExcluded = true;
duplicateInteractions[key]--;
}
if (isExcluded) {
flags[i] &= -1-(1<<j);
......@@ -2584,6 +2579,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
}
interactionGroupData.initialize<int4>(cu, groupData.size(), "interactionGroupData");
interactionGroupData.upload(groupData);
numGroupTiles.initialize<int>(cu, 1, "numGroupTiles");
// Allocate space for a neighbor list, if necessary.
if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cu.getNumThreadBlocks()) {
filteredGroupData.initialize<int4>(cu, groupData.size(), "filteredGroupData");
interactionGroupData.copyTo(filteredGroupData);
int numTiles = groupData.size()/32;
numGroupTiles.upload(&numTiles);
}
// Create the kernel.
......@@ -2662,11 +2667,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
defines["USE_CUTOFF"] = "1";
if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
defines["USE_PERIODIC"] = "1";
defines["LOCAL_MEMORY_SIZE"] = cu.intToString(max(32, cu.getNonbondedUtilities().getForceThreadBlockSize()));
int localMemorySize = max(32, cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["LOCAL_MEMORY_SIZE"] = cu.intToString(localMemorySize);
defines["WARPS_IN_BLOCK"] = cu.intToString(localMemorySize/32);
double cutoff = force.getCutoffDistance();
defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
double paddedCutoff = cu.getNonbondedUtilities().padCutoff(cutoff);
defines["PADDED_CUTOFF_SQUARED"] = cu.doubleToString(paddedCutoff*paddedCutoff);
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["TILE_SIZE"] = "32";
defines["NUM_TILES"] = cu.intToString(numTileSets);
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*numTileSets/numContexts;
int endIndex = (cu.getContextIndex()+1)*numTileSets/numContexts;
......@@ -2674,12 +2684,19 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
defines["LAST_TILE"] = cu.intToString(endIndex);
if ((localDataSize/4)%2 == 0 && !cu.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
CUmodule program = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cu.getKernel(program, "computeInteractionGroups");
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cu.getKernel(module, "computeInteractionGroups");
prepareNeighborListKernel = cu.getKernel(module, "prepareToBuildNeighborList");
buildNeighborListKernel = cu.getKernel(module, "buildNeighborList");
numGroupThreadBlocks = cu.getNonbondedUtilities().getNumForceThreadBlocks();
}
double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
useNeighborList = (filteredGroupData.isInitialized() && cu.getNonbondedUtilities().getUseCutoff());
if (useNeighborList && cu.getContextIndex() > 0) {
// When using a neighbor list, run the whole calculation on a single device.
return 0.0;
}
if (globals.isInitialized()) {
bool changed = false;
for (int i = 0; i < (int) globalParamNames.size(); i++) {
......@@ -2706,7 +2723,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs.push_back(&cu.getForce().getDevicePointer());
interactionGroupArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
interactionGroupArgs.push_back(&cu.getPosq().getDevicePointer());
interactionGroupArgs.push_back(&interactionGroupData.getDevicePointer());
interactionGroupArgs.push_back(&(useNeighborList ? filteredGroupData : interactionGroupData).getDevicePointer());
interactionGroupArgs.push_back(&numGroupTiles.getDevicePointer());
interactionGroupArgs.push_back(&useNeighborList);
interactionGroupArgs.push_back(cu.getPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getInvPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxVecXPointer());
......@@ -2720,8 +2739,30 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs.push_back(&globals.getDevicePointer());
if (hasParamDerivs)
interactionGroupArgs.push_back(&cu.getEnergyParamDerivBuffer().getDevicePointer());
if (useNeighborList) {
// Initialize kernels for building the interaction group neighbor list.
prepareNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
prepareNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
buildNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
buildNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
buildNeighborListArgs.push_back(&cu.getPosq().getDevicePointer());
buildNeighborListArgs.push_back(&interactionGroupData.getDevicePointer());
buildNeighborListArgs.push_back(&filteredGroupData.getDevicePointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxSizePointer());
buildNeighborListArgs.push_back(cu.getInvPeriodicBoxSizePointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecXPointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecYPointer());
buildNeighborListArgs.push_back(cu.getPeriodicBoxVecZPointer());
}
}
int forceThreadBlockSize = cu.getNonbondedUtilities().getForceThreadBlockSize();
if (useNeighborList) {
// Rebuild the neighbor list, if necessary.
cu.executeKernel(prepareNeighborListKernel, &prepareNeighborListArgs[0], 1, 1);
cu.executeKernel(buildNeighborListKernel, &buildNeighborListArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
cu.executeKernel(interactionGroupKernel, &interactionGroupArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
double4 boxSize = cu.getPeriodicBoxSize();
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -266,6 +266,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(count);
}
// Record arguments for kernels.
......@@ -351,6 +352,11 @@ double CudaNonbondedUtilities::getMaxCutoffDistance() {
return cutoff;
}
double CudaNonbondedUtilities::padCutoff(double cutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
return cutoff+padding;
}
void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
if ((forceGroups&groupFlags) == 0)
return;
......@@ -462,13 +468,12 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.source = source;
kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
double paddedCutoff = padCutoff(cutoff);
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
......
......@@ -10,6 +10,7 @@ typedef struct {
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
int* __restrict__ numGroupTiles, bool useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
......@@ -20,8 +21,8 @@ extern "C" __global__ void computeInteractionGroups(
INIT_DERIVATIVES
__shared__ AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
......@@ -86,3 +87,75 @@ extern "C" __global__ void computeInteractionGroups(
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_DERIVATIVES
}
/**
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
extern "C" __global__ void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
if (rebuildNeighborList[0] == 1)
numGroupTiles[0] = 0;
}
/**
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
if (rebuildNeighborList[0] == 0)
return;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
__shared__ real4 localPos[LOCAL_MEMORY_SIZE];
__shared__ volatile bool anyInteraction[WARPS_IN_BLOCK];
__shared__ volatile int tileIndex[WARPS_IN_BLOCK];
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
localPos[threadIdx.x] = posq[atom2];
if (tgx == 0)
anyInteraction[local_warp] = false;
int tj = tgx;
SYNC_WARPS;
for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
if (tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
anyInteraction[local_warp] = true;
}
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
if (anyInteraction[local_warp]) {
SYNC_WARPS;
if (tgx == 0)
tileIndex[local_warp] = atomicAdd(numGroupTiles, 1);
SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
}
}
}
......@@ -742,15 +742,15 @@ private:
ForceInfo* info;
OpenCLParameterSet* params;
OpenCLArray globals;
OpenCLArray interactionGroupData;
cl::Kernel interactionGroupKernel;
OpenCLArray interactionGroupData, filteredGroupData, numGroupTiles;
cl::Kernel interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
std::vector<void*> interactionGroupArgs;
std::vector<std::string> globalParamNames;
std::vector<cl_float> globalParamValues;
std::vector<OpenCLArray> tabulatedFunctions;
double longRangeCoefficient;
std::vector<double> longRangeCoefficientDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy;
const System& system;
......
......@@ -153,6 +153,11 @@ public:
bool getHasInteractions() {
return (groupCutoff.size() > 0);
}
/**
* Given a nonbonded cutoff, get the padded cutoff distance used in computing
* the neighbor list.
*/
double padCutoff(double cutoff);
/**
* Prepare to compute interactions. This updates the neighbor list.
*/
......@@ -225,6 +230,13 @@ public:
OpenCLArray& getExclusionRowIndices() {
return exclusionRowIndices;
}
/**
* Get the array containing a flag for whether the neighbor list was rebuilt
* on the most recent call to prepareInteractions().
*/
OpenCLArray& getRebuildNeighborList() {
return rebuildNeighborList;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
......
......@@ -54,6 +54,7 @@
#include "jama_eig.h"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <set>
using namespace OpenMM;
......@@ -2550,7 +2551,8 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
vector<vector<int> > atomLists;
vector<pair<int, int> > tiles;
map<pair<int, int>, int> duplicateInteractions;
vector<int> tileGroup;
vector<vector<int> > duplicateAtomsForGroup;
for (int group = 0; group < force.getNumInteractionGroups(); group++) {
// Get the list of atoms in this group and sort them.
......@@ -2561,6 +2563,10 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
sort(atoms1.begin(), atoms1.end());
sort(atoms2.begin(), atoms2.end());
duplicateAtomsForGroup.push_back(vector<int>());
set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(),
inserter(duplicateAtomsForGroup[group], duplicateAtomsForGroup[group].begin()));
sort(duplicateAtomsForGroup[group].begin(), duplicateAtomsForGroup[group].end());
// Find how many tiles we will create for this group.
......@@ -2572,9 +2578,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
// Add the tiles.
int firstTile = tiles.size();
for (int i = 0; i < numBlocks1; i++)
for (int j = 0; j < numBlocks2; j++)
for (int j = 0; j < numBlocks2; j++) {
tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
tileGroup.push_back(group);
}
// Add the atom lists.
......@@ -2594,22 +2603,6 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
atoms.push_back(atoms2[j]);
atomLists.push_back(atoms);
}
// If this group contains duplicate interactions, record that we need to skip them once.
for (int a1 : atoms1) {
if (set2.find(a1) == set2.end())
continue;
for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
int a2 = atoms2[j];
if (set1.find(a2) != set1.end()) {
pair<int, int> key = make_pair(a2, a1);
if (duplicateInteractions.find(key) == duplicateInteractions.end())
duplicateInteractions[key] = 0;
duplicateInteractions[key]++;
}
}
}
}
// Build a lookup table for quickly identifying excluded interactions.
......@@ -2627,15 +2620,18 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
vector<vector<int> > exclusionFlags(tiles.size());
vector<pair<int, int> > tileOrder;
for (int tile = 0; tile < tiles.size(); tile++) {
bool swapped = false;
if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
// For efficiency, we want the first axis to be the larger one.
int swap = tiles[tile].first;
tiles[tile].first = tiles[tile].second;
tiles[tile].second = swap;
swapped = true;
}
vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second];
vector<int>& duplicateAtoms = duplicateAtomsForGroup[tileGroup[tile]];
vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
int numExcluded = 0;
for (int i = 0; i < (int) atoms1.size(); i++)
......@@ -2646,11 +2642,10 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
if (a1 == a2 || exclusions.find(key) != exclusions.end())
isExcluded = true; // This is an excluded interaction.
else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
else if ((a1 > a2) == swapped && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a1) && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a2)) {
// Both atoms are in both sets, so skip duplicate interactions.
isExcluded = true;
duplicateInteractions[key]--;
}
if (isExcluded) {
flags[i] &= -1-(1<<j);
......@@ -2713,6 +2708,16 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
}
interactionGroupData.initialize<mm_int4>(cl, groupData.size(), "interactionGroupData");
interactionGroupData.upload(groupData);
numGroupTiles.initialize<cl_int>(cl, 1, "numGroupTiles");
// Allocate space for a neighbor list, if necessary.
if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cl.getNumThreadBlocks()) {
filteredGroupData.initialize<mm_int4>(cl, groupData.size(), "filteredGroupData");
interactionGroupData.copyTo(filteredGroupData);
int numTiles = groupData.size()/32;
numGroupTiles.upload(&numTiles);
}
// Create the kernel.
......@@ -2791,11 +2796,16 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
defines["USE_CUTOFF"] = "1";
if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
defines["USE_PERIODIC"] = "1";
defines["LOCAL_MEMORY_SIZE"] = cl.intToString(max(32, cl.getNonbondedUtilities().getForceThreadBlockSize()));
int localMemorySize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
defines["LOCAL_MEMORY_SIZE"] = cl.intToString(localMemorySize);
defines["WARPS_IN_BLOCK"] = cl.intToString(localMemorySize/32);
double cutoff = force.getCutoffDistance();
defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
double paddedCutoff = cl.getNonbondedUtilities().padCutoff(cutoff);
defines["PADDED_CUTOFF_SQUARED"] = cl.doubleToString(paddedCutoff*paddedCutoff);
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["TILE_SIZE"] = "32";
defines["NUM_TILES"] = cl.intToString(numTileSets);
int numContexts = cl.getPlatformData().contexts.size();
int startIndex = cl.getContextIndex()*numTileSets/numContexts;
int endIndex = (cl.getContextIndex()+1)*numTileSets/numContexts;
......@@ -2805,10 +2815,17 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cl::Kernel(program, "computeInteractionGroups");
prepareNeighborListKernel = cl::Kernel(program, "prepareToBuildNeighborList");
buildNeighborListKernel = cl::Kernel(program, "buildNeighborList");
numGroupThreadBlocks = cl.getNonbondedUtilities().getNumForceThreadBlocks();
}
double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
useNeighborList = (filteredGroupData.isInitialized() && cl.getNonbondedUtilities().getUseCutoff());
if (useNeighborList && cl.getContextIndex() > 0) {
// When using a neighbor list, run the whole calculation on a single device.
return 0.0;
}
if (globals.isInitialized()) {
bool changed = false;
for (int i = 0; i < (int) globalParamNames.size(); i++) {
......@@ -2837,7 +2854,9 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData.getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, (useNeighborList ? filteredGroupData : interactionGroupData).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, numGroupTiles.getDeviceBuffer());
interactionGroupKernel.setArg<cl_int>(index++, useNeighborList);
index += 5;
for (auto& buffer : params->getBuffers())
interactionGroupKernel.setArg<cl::Memory>(index++, buffer.getMemory());
......@@ -2847,9 +2866,27 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
interactionGroupKernel.setArg<cl::Buffer>(index++, globals.getDeviceBuffer());
if (hasParamDerivs)
interactionGroupKernel.setArg<cl::Memory>(index++, cl.getEnergyParamDerivBuffer().getDeviceBuffer());
if (useNeighborList) {
// Initialize kernels for building the interaction group neighbor list.
prepareNeighborListKernel.setArg<cl::Buffer>(0, cl.getNonbondedUtilities().getRebuildNeighborList().getDeviceBuffer());
prepareNeighborListKernel.setArg<cl::Buffer>(1, numGroupTiles.getDeviceBuffer());
buildNeighborListKernel.setArg<cl::Buffer>(0, cl.getNonbondedUtilities().getRebuildNeighborList().getDeviceBuffer());
buildNeighborListKernel.setArg<cl::Buffer>(1, numGroupTiles.getDeviceBuffer());
buildNeighborListKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
buildNeighborListKernel.setArg<cl::Buffer>(3, interactionGroupData.getDeviceBuffer());
buildNeighborListKernel.setArg<cl::Buffer>(4, filteredGroupData.getDeviceBuffer());
}
}
setPeriodicBoxArgs(cl, interactionGroupKernel, 4);
int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
if (useNeighborList) {
// Rebuild the neighbor list, if necessary.
setPeriodicBoxArgs(cl, buildNeighborListKernel, 5);
cl.executeKernel(prepareNeighborListKernel, 1, 1);
cl.executeKernel(buildNeighborListKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
setPeriodicBoxArgs(cl, interactionGroupKernel, 6);
cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -296,6 +296,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<cl_uint> count(1, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(count);
}
}
......@@ -323,6 +324,11 @@ double OpenCLNonbondedUtilities::getMaxCutoffDistance() {
return cutoff;
}
double OpenCLNonbondedUtilities::padCutoff(double cutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
return cutoff+padding;
}
void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
if ((forceGroups&groupFlags) == 0)
return;
......@@ -464,12 +470,11 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.cutoffDistance = cutoff;
kernels.source = source;
if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
double paddedCutoff = padCutoff(cutoff);
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
......
......@@ -43,6 +43,7 @@ __kernel void computeInteractionGroups(
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
__global int* restrict numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
......@@ -53,8 +54,8 @@ __kernel void computeInteractionGroups(
INIT_DERIVATIVES
__local AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
......@@ -129,3 +130,74 @@ __kernel void computeInteractionGroups(
energyBuffer[get_global_id(0)] += energy;
SAVE_DERIVATIVES
}
/**
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) {
if (rebuildNeighborList[0] == 1)
numGroupTiles[0] = 0;
}
/**
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles,
__global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
if (rebuildNeighborList[0] == 0)
return;
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex
__local real4 localPos[LOCAL_MEMORY_SIZE];
__local volatile bool anyInteraction[WARPS_IN_BLOCK];
__local volatile int tileIndex[WARPS_IN_BLOCK];
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
localPos[get_local_id(0)] = posq[atom2];
if (tgx == 0)
anyInteraction[local_warp] = false;
int tj = tgx;
SYNC_WARPS;
for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
if (tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
anyInteraction[local_warp] = true;
}
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
if (anyInteraction[local_warp]) {
SYNC_WARPS;
if (tgx == 0)
tileIndex[local_warp] = atomic_add(numGroupTiles, 1);
SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
}
}
}
......@@ -7,7 +7,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Portions copyright (c) 2008-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -978,6 +978,68 @@ void testInteractionGroupTabulatedFunction() {
}
}
void testInteractionGroupWithCutoff() {
const int numParticles = 1000;
const double boxSize = 10.0;
System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
NonbondedForce* standard = new NonbondedForce();
CustomNonbondedForce* custom = new CustomNonbondedForce("100/(r+0.1)");
system.addForce(standard);
system.addForce(custom);
standard->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
custom->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
standard->setCutoffDistance(1.0);
custom->setCutoffDistance(1.0);
standard->setUseSwitchingFunction(true);
custom->setUseSwitchingFunction(true);
standard->setSwitchingDistance(0.9);
custom->setSwitchingDistance(0.8);
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < numParticles; i++) {
system.addParticle(10.0);
standard->addParticle(0.0, 0.2, 0.1);
custom->addParticle();
while (true) {
positions[i] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt))*boxSize;
bool tooClose = false;
for (int j = 0; j < i; j++) {
Vec3 delta = positions[i]-positions[j];
if (delta.dot(delta) < 0.5*0.5)
tooClose = true;
}
if (!tooClose)
break;
}
}
set<int> set1, set2;
for (int i = 0; i < 10; i++)
set1.insert(2*i);
for (int i = 0; i < numParticles; i++)
set2.insert(i);
custom->addInteractionGroup(set1, set2);
custom->setForceGroup(1);
// Try simulating it and see if energy is conserved (indicating that any optimizations
// for combining the cutoff with the interaction group are behaving consistently).
VerletIntegrator integrator(0.001);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setVelocitiesToTemperature(100);
ASSERT(context.getState(State::Energy, false, 1<<1).getPotentialEnergy() != 0.0);
State initialState = context.getState(State::Energy);
double initialEnergy = initialState.getPotentialEnergy()+initialState.getKineticEnergy();
for (int i = 0; i < 100; i++) {
integrator.step(10);
State state = context.getState(State::Energy);
double energy = state.getPotentialEnergy()+state.getKineticEnergy();
ASSERT_EQUAL_TOL(initialEnergy, energy, 0.001);
}
}
void testMultipleCutoffs() {
System system;
system.addParticle(1.0);
......@@ -1253,6 +1315,7 @@ int main(int argc, char* argv[]) {
testLargeInteractionGroup();
testInteractionGroupLongRangeCorrection();
testInteractionGroupTabulatedFunction();
testInteractionGroupWithCutoff();
testMultipleCutoffs();
testMultipleSwitches();
testIllegalVariable();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment