Unverified Commit cdc0789a authored by peastman's avatar peastman Committed by GitHub
Browse files

Fixed range overflow with very large numbers of atoms (#2806)

* Fixed range overflow with very large numbers of atoms

* More fixes to overflow with large numbers of atoms

* Fix test failures
parent b4543a46
...@@ -17,7 +17,7 @@ KERNEL void computeN2Energy( ...@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
#endif #endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy, GLOBAL const int2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
...@@ -41,7 +41,7 @@ KERNEL void computeN2Energy( ...@@ -41,7 +41,7 @@ KERNEL void computeN2Energy(
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real3 force = make_real3(0); real3 force = make_real3(0);
...@@ -236,7 +236,7 @@ KERNEL void computeN2Energy( ...@@ -236,7 +236,7 @@ KERNEL void computeN2Energy(
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -17,7 +17,7 @@ KERNEL void computeN2Energy( ...@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
#endif #endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy, GLOBAL const int2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
...@@ -37,7 +37,7 @@ KERNEL void computeN2Energy( ...@@ -37,7 +37,7 @@ KERNEL void computeN2Energy(
const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -248,7 +248,7 @@ KERNEL void computeN2Energy( ...@@ -248,7 +248,7 @@ KERNEL void computeN2Energy(
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else #else
...@@ -29,7 +29,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -29,7 +29,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real value = 0; real value = 0;
...@@ -205,7 +205,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -205,7 +205,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else #else
...@@ -25,7 +25,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -25,7 +25,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -213,7 +213,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -213,7 +213,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -24,7 +24,7 @@ KERNEL void computeBornSum( ...@@ -24,7 +24,7 @@ KERNEL void computeBornSum(
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
GLOBAL const ushort2* RESTRICT exclusionTiles) { GLOBAL const int2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
...@@ -36,7 +36,7 @@ KERNEL void computeBornSum( ...@@ -36,7 +36,7 @@ KERNEL void computeBornSum(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real bornSum = 0; real bornSum = 0;
...@@ -209,7 +209,7 @@ KERNEL void computeBornSum( ...@@ -209,7 +209,7 @@ KERNEL void computeBornSum(
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
...@@ -393,7 +393,7 @@ KERNEL void computeGBSAForce1( ...@@ -393,7 +393,7 @@ KERNEL void computeGBSAForce1(
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
GLOBAL const ushort2* RESTRICT exclusionTiles) { GLOBAL const int2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
...@@ -406,7 +406,7 @@ KERNEL void computeGBSAForce1( ...@@ -406,7 +406,7 @@ KERNEL void computeGBSAForce1(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real4 force = make_real4(0); real4 force = make_real4(0);
...@@ -604,7 +604,7 @@ KERNEL void computeGBSAForce1( ...@@ -604,7 +604,7 @@ KERNEL void computeGBSAForce1(
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -22,7 +22,7 @@ KERNEL void computeBornSum( ...@@ -22,7 +22,7 @@ KERNEL void computeBornSum(
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
GLOBAL const ushort2* exclusionTiles) { GLOBAL const int2* exclusionTiles) {
LOCAL AtomData1 localData[TILE_SIZE]; LOCAL AtomData1 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -30,7 +30,7 @@ KERNEL void computeBornSum( ...@@ -30,7 +30,7 @@ KERNEL void computeBornSum(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -213,7 +213,7 @@ KERNEL void computeBornSum( ...@@ -213,7 +213,7 @@ KERNEL void computeBornSum(
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
...@@ -416,7 +416,7 @@ KERNEL void computeGBSAForce1( ...@@ -416,7 +416,7 @@ KERNEL void computeGBSAForce1(
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
GLOBAL const ushort2* exclusionTiles) { GLOBAL const int2* exclusionTiles) {
mixed energy = 0; mixed energy = 0;
LOCAL AtomData2 localData[TILE_SIZE]; LOCAL AtomData2 localData[TILE_SIZE];
...@@ -425,7 +425,7 @@ KERNEL void computeGBSAForce1( ...@@ -425,7 +425,7 @@ KERNEL void computeGBSAForce1(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -637,7 +637,7 @@ KERNEL void computeGBSAForce1( ...@@ -637,7 +637,7 @@ KERNEL void computeGBSAForce1(
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -33,4 +33,5 @@ ...@@ -33,4 +33,5 @@
#include "TestNonbondedForce.h" #include "TestNonbondedForce.h"
void runPlatformTests() { void runPlatformTests() {
testHugeSystem();
} }
...@@ -348,7 +348,8 @@ private: ...@@ -348,7 +348,8 @@ private:
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList; bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags; int startTileIndex, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
long long numTiles;
std::string kernelSource; std::string kernelSource;
}; };
......
...@@ -167,7 +167,7 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu ...@@ -167,7 +167,7 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
} }
} }
static bool compareUshort2(ushort2 a, ushort2 b) { static bool compareInt2(int2 a, int2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x)); return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
} }
...@@ -199,15 +199,15 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -199,15 +199,15 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y))); tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
} }
} }
vector<ushort2> exclusionTilesVec; vector<int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second)); exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2); sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareInt2);
exclusionTiles.initialize<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles"); exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec); exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap; map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) { for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
ushort2 tile = exclusionTilesVec[i]; int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i; exclusionTileMap[make_pair(tile.x, tile.y)] = i;
} }
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks); vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
...@@ -463,9 +463,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF ...@@ -463,9 +463,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
int numAtomBlocks = context.getNumAtomBlocks(); int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks); startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex; numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles); startTileIndex = (int) (startFraction*totalTiles);
numTiles = (int) (endFraction*totalTiles)-startTileIndex; numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
} }
......
...@@ -264,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -264,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
includeBlock2 = forceInclude = true; includeBlock2 = forceInclude = true;
#endif #endif
if (includeBlock2) { if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y; int y = (int) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y); includeBlock2 &= (exclusionsForX[k] != y);
} }
...@@ -278,7 +278,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -278,7 +278,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
int i = __ffs(includeBlockFlags)-1; int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1; includeBlockFlags &= includeBlockFlags-1;
forceInclude = (forceIncludeFlags>>i) & 1; forceInclude = (forceIncludeFlags>>i) & 1;
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y; int y = (int) sortedBlocks[block2Base+i].y;
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
......
...@@ -105,7 +105,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) { ...@@ -105,7 +105,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
*/ */
extern "C" __global__ void computeNonbonded( extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
...@@ -129,7 +129,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -129,7 +129,7 @@ extern "C" __global__ void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real3 force = make_real3(0); real3 force = make_real3(0);
...@@ -324,12 +324,11 @@ extern "C" __global__ void computeNonbonded( ...@@ -324,12 +324,11 @@ extern "C" __global__ void computeNonbonded(
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps); int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps); int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#else #else
const unsigned int numTiles = numTileIndices; int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int pos = (int) (startTileIndex+warp*(long long)numTiles/totalWarps); int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
...@@ -365,7 +364,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -365,7 +364,7 @@ extern "C" __global__ void computeNonbonded(
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. * * Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -31,6 +31,8 @@ ...@@ -31,6 +31,8 @@
#include "CudaTests.h" #include "CudaTests.h"
#include "TestNonbondedForce.h" #include "TestNonbondedForce.h"
#include <cuda.h>
#include <string>
void testParallelComputation(NonbondedForce::NonbondedMethod method) { void testParallelComputation(NonbondedForce::NonbondedMethod method) {
System system; System system;
...@@ -152,10 +154,33 @@ void testDeterministicForces() { ...@@ -152,10 +154,33 @@ void testDeterministicForces() {
} }
} }
bool canRunHugeTest() {
// Create a minimal context just to see which device is being used.
System system;
system.addParticle(1.0);
VerletIntegrator integrator(1.0);
Context context(system, integrator, platform);
int deviceIndex = stoi(platform.getPropertyValue(context, CudaPlatform::CudaDeviceIndex()));
// Find out how much memory the device has.
CUdevice device;
cuDeviceGet(&device, deviceIndex);
size_t memory;
cuDeviceTotalMem(&memory, device);
// Only run the huge test if the device has at least 4 GB of memory.
return (memory >= 4*(1<<30));
}
void runPlatformTests() { void runPlatformTests() {
testParallelComputation(NonbondedForce::NoCutoff); testParallelComputation(NonbondedForce::NoCutoff);
testParallelComputation(NonbondedForce::Ewald); testParallelComputation(NonbondedForce::Ewald);
testParallelComputation(NonbondedForce::PME); testParallelComputation(NonbondedForce::PME);
testReordering(); testReordering();
testDeterministicForces(); testDeterministicForces();
if (canRunHugeTest())
testHugeSystem();
} }
...@@ -328,8 +328,9 @@ private: ...@@ -328,8 +328,9 @@ private:
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks; int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags; int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
long long numTiles;
}; };
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. * * Portions copyright (c) 2009-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -177,13 +177,13 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc ...@@ -177,13 +177,13 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc
} }
} }
static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) { static bool compareInt2(mm_int2 a, mm_int2 b) {
// This version is used on devices with SIMD width of 32 or less. It sorts tiles to improve cache efficiency. // This version is used on devices with SIMD width of 32 or less. It sorts tiles to improve cache efficiency.
return ((a.y < b.y) || (a.y == b.y && a.x < b.x)); return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
} }
static bool compareUshort2LargeSIMD(mm_ushort2 a, mm_ushort2 b) { static bool compareInt2LargeSIMD(mm_int2 a, mm_int2 b) {
// This version is used on devices with SIMD width greater than 32. It puts diagonal tiles before off-diagonal // This version is used on devices with SIMD width greater than 32. It puts diagonal tiles before off-diagonal
// ones to reduce thread divergence. // ones to reduce thread divergence.
...@@ -223,15 +223,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -223,15 +223,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y))); tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
} }
} }
vector<mm_ushort2> exclusionTilesVec; vector<mm_int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second)); exclusionTilesVec.push_back(mm_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareUshort2 : compareUshort2LargeSIMD); sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareInt2 : compareInt2LargeSIMD);
exclusionTiles.initialize<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles"); exclusionTiles.initialize<mm_int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec); exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap; map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) { for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
mm_ushort2 tile = exclusionTilesVec[i]; mm_int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i; exclusionTileMap[make_pair(tile.x, tile.y)] = i;
} }
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks); vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
...@@ -440,9 +440,9 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en ...@@ -440,9 +440,9 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
int numAtomBlocks = context.getNumAtomBlocks(); int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks); startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex; numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);; startTileIndex = (int) (startFraction*totalTiles);;
numTiles = (int) (endFraction*totalTiles)-startTileIndex; numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
if (useCutoff) { if (useCutoff) {
// We are using a cutoff, and the kernels have already been created. // We are using a cutoff, and the kernels have already been created.
...@@ -450,15 +450,15 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en ...@@ -450,15 +450,15 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
KernelSet& kernels = iter->second; KernelSet& kernels = iter->second;
if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) { if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
kernels.forceKernel.setArg<cl_uint>(5, startTileIndex); kernels.forceKernel.setArg<cl_uint>(5, startTileIndex);
kernels.forceKernel.setArg<cl_uint>(6, numTiles); kernels.forceKernel.setArg<cl_ulong>(6, numTiles);
} }
if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) { if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
kernels.energyKernel.setArg<cl_uint>(5, startTileIndex); kernels.energyKernel.setArg<cl_uint>(5, startTileIndex);
kernels.energyKernel.setArg<cl_uint>(6, numTiles); kernels.energyKernel.setArg<cl_ulong>(6, numTiles);
} }
if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) { if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
kernels.forceEnergyKernel.setArg<cl_uint>(5, startTileIndex); kernels.forceEnergyKernel.setArg<cl_uint>(5, startTileIndex);
kernels.forceEnergyKernel.setArg<cl_uint>(6, numTiles); kernels.forceEnergyKernel.setArg<cl_ulong>(6, numTiles);
} }
kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex); kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks); kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
...@@ -711,7 +711,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -711,7 +711,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer());
kernel.setArg<cl_uint>(index++, startTileIndex); kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, numTiles); kernel.setArg<cl_ulong>(index++, numTiles);
if (useCutoff) { if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer());
......
...@@ -166,7 +166,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -166,7 +166,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
includeBlock2 = true; includeBlock2 = true;
#endif #endif
if (includeBlock2) { if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y; int y = (int) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y); includeBlock2 &= (exclusionsForX[k] != y);
} }
...@@ -180,7 +180,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -180,7 +180,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
while (i < TILE_SIZE && !includeBlockFlags[warpStart+i]) while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
i++; i++;
if (i < TILE_SIZE) { if (i < TILE_SIZE) {
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y; int y = (int) sortedBlocks[block2Base+i].y;
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
......
...@@ -69,7 +69,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -69,7 +69,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them * This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory. * to global memory.
*/ */
void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount, void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
__global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX,
real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) { real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
real4 posBuffer[TILE_SIZE]; real4 posBuffer[TILE_SIZE];
...@@ -167,7 +167,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -167,7 +167,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
__global const int* restrict rebuildNeighborList) { __global const int* restrict rebuildNeighborList) {
if (rebuildNeighborList[0] == 0) if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt. return; // The neighbor list doesn't need to be rebuilt.
unsigned short buffer[BUFFER_SIZE]; int buffer[BUFFER_SIZE];
int atoms[BUFFER_SIZE]; int atoms[BUFFER_SIZE];
int exclusionsForX[MAX_EXCLUSIONS]; int exclusionsForX[MAX_EXCLUSIONS];
int valuesInBuffer; int valuesInBuffer;
...@@ -179,7 +179,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -179,7 +179,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
valuesInBuffer = 0; valuesInBuffer = 0;
numAtoms = 0; numAtoms = 0;
real2 sortedKey = sortedBlocks[i]; real2 sortedKey = sortedBlocks[i];
unsigned short x = (unsigned short) sortedKey.y; int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i]; real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i]; real4 blockSizeX = sortedBlockBoundingBox[i];
...@@ -195,7 +195,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -195,7 +195,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int j = i+1; j < NUM_BLOCKS; j++) { for (int j = i+1; j < NUM_BLOCKS; j++) {
real2 sortedKey2 = sortedBlocks[j]; real2 sortedKey2 = sortedBlocks[j];
unsigned short y = (unsigned short) sortedKey2.y; int y = (int) sortedKey2.y;
bool hasExclusions = false; bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y); hasExclusions |= (exclusionsForX[k] == y);
......
...@@ -23,7 +23,7 @@ __kernel void computeNonbonded( ...@@ -23,7 +23,7 @@ __kernel void computeNonbonded(
__global real4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
...@@ -43,7 +43,7 @@ __kernel void computeNonbonded( ...@@ -43,7 +43,7 @@ __kernel void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real4 force = 0; real4 force = 0;
...@@ -205,12 +205,11 @@ __kernel void computeNonbonded( ...@@ -205,12 +205,11 @@ __kernel void computeNonbonded(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long)numTileIndices/totalWarps : warp*(long)numTiles/totalWarps); int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long)numTileIndices/totalWarps : (warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(long)numTiles/totalWarps);
#else #else
const unsigned int numTiles = numTileIndices; int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int pos = (int) (startTileIndex+warp*(long)numTiles/totalWarps); int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*(long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
...@@ -247,7 +246,7 @@ __kernel void computeNonbonded( ...@@ -247,7 +246,7 @@ __kernel void computeNonbonded(
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -20,7 +20,7 @@ __kernel void computeNonbonded( ...@@ -20,7 +20,7 @@ __kernel void computeNonbonded(
__global real4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
...@@ -36,7 +36,7 @@ __kernel void computeNonbonded( ...@@ -36,7 +36,7 @@ __kernel void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -256,7 +256,7 @@ __kernel void computeNonbonded( ...@@ -256,7 +256,7 @@ __kernel void computeNonbonded(
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. * * Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -31,6 +31,8 @@ ...@@ -31,6 +31,8 @@
#include "OpenCLTests.h" #include "OpenCLTests.h"
#include "TestNonbondedForce.h" #include "TestNonbondedForce.h"
#include <cl.hpp>
#include <string>
void testParallelComputation(NonbondedForce::NonbondedMethod method) { void testParallelComputation(NonbondedForce::NonbondedMethod method) {
System system; System system;
...@@ -122,9 +124,34 @@ void testReordering() { ...@@ -122,9 +124,34 @@ void testReordering() {
} }
} }
bool canRunHugeTest() {
// Create a minimal context just to see which platform and device are being used.
System system;
system.addParticle(1.0);
VerletIntegrator integrator(1.0);
Context context(system, integrator, platform);
int platformIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLPlatformIndex()));
int deviceIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLDeviceIndex()));
// Find out how much memory the device has.
vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
vector<cl::Device> devices;
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
long long memory = devices[deviceIndex].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
// Only run the huge test if the device has at least 4 GB of memory.
return (memory >= 4*(1<<30));
}
void runPlatformTests() { void runPlatformTests() {
testParallelComputation(NonbondedForce::NoCutoff); testParallelComputation(NonbondedForce::NoCutoff);
testParallelComputation(NonbondedForce::Ewald); testParallelComputation(NonbondedForce::Ewald);
testParallelComputation(NonbondedForce::PME); testParallelComputation(NonbondedForce::PME);
testReordering(); testReordering();
if (canRunHugeTest())
testHugeSystem();
} }
...@@ -1239,11 +1239,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() { ...@@ -1239,11 +1239,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
// Figure out the covalent flag values to use for each atom pair. // Figure out the covalent flag values to use for each atom pair.
vector<ushort2> exclusionTiles; vector<int2> exclusionTiles;
nb.getExclusionTiles().download(exclusionTiles); nb.getExclusionTiles().download(exclusionTiles);
map<pair<int, int>, int> exclusionTileMap; map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTiles.size(); i++) { for (int i = 0; i < (int) exclusionTiles.size(); i++) {
ushort2 tile = exclusionTiles[i]; int2 tile = exclusionTiles[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i; exclusionTileMap[make_pair(tile.x, tile.y)] = i;
} }
covalentFlags.initialize<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags"); covalentFlags.initialize<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment