Unverified Commit cdc0789a authored by peastman's avatar peastman Committed by GitHub
Browse files

Fixed range overflow with very large numbers of atoms (#2806)

* Fixed range overflow with very large numbers of atoms

* More fixes to overflow with large numbers of atoms

* Fix test failures
parent b4543a46
......@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
#endif
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy,
GLOBAL const int2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
......@@ -41,7 +41,7 @@ KERNEL void computeN2Energy(
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
......@@ -236,7 +236,7 @@ KERNEL void computeN2Energy(
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
#endif
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy,
GLOBAL const int2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
......@@ -37,7 +37,7 @@ KERNEL void computeN2Energy(
const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
......@@ -248,7 +248,7 @@ KERNEL void computeN2Energy(
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -2,7 +2,7 @@
* Compute a value based on pair interactions.
*/
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles,
GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value,
#else
......@@ -29,7 +29,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0;
......@@ -205,7 +205,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -2,7 +2,7 @@
* Compute a value based on pair interactions.
*/
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles,
GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value,
#else
......@@ -25,7 +25,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
......@@ -213,7 +213,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -24,7 +24,7 @@ KERNEL void computeBornSum(
#else
unsigned int numTiles,
#endif
GLOBAL const ushort2* RESTRICT exclusionTiles) {
GLOBAL const int2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
......@@ -36,7 +36,7 @@ KERNEL void computeBornSum(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real bornSum = 0;
......@@ -209,7 +209,7 @@ KERNEL void computeBornSum(
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......@@ -393,7 +393,7 @@ KERNEL void computeGBSAForce1(
#else
unsigned int numTiles,
#endif
GLOBAL const ushort2* RESTRICT exclusionTiles) {
GLOBAL const int2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
......@@ -406,7 +406,7 @@ KERNEL void computeGBSAForce1(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = make_real4(0);
......@@ -604,7 +604,7 @@ KERNEL void computeGBSAForce1(
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -22,7 +22,7 @@ KERNEL void computeBornSum(
#else
unsigned int numTiles,
#endif
GLOBAL const ushort2* exclusionTiles) {
GLOBAL const int2* exclusionTiles) {
LOCAL AtomData1 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
......@@ -30,7 +30,7 @@ KERNEL void computeBornSum(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
......@@ -213,7 +213,7 @@ KERNEL void computeBornSum(
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......@@ -416,7 +416,7 @@ KERNEL void computeGBSAForce1(
#else
unsigned int numTiles,
#endif
GLOBAL const ushort2* exclusionTiles) {
GLOBAL const int2* exclusionTiles) {
mixed energy = 0;
LOCAL AtomData2 localData[TILE_SIZE];
......@@ -425,7 +425,7 @@ KERNEL void computeGBSAForce1(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
......@@ -637,7 +637,7 @@ KERNEL void computeGBSAForce1(
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -33,4 +33,5 @@
#include "TestNonbondedForce.h"
void runPlatformTests() {
testHugeSystem();
}
......@@ -348,7 +348,8 @@ private:
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
int startTileIndex, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
long long numTiles;
std::string kernelSource;
};
......
......@@ -167,7 +167,7 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
}
}
static bool compareUshort2(ushort2 a, ushort2 b) {
static bool compareInt2(int2 a, int2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
......@@ -199,15 +199,15 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
vector<ushort2> exclusionTilesVec;
vector<int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
exclusionTiles.initialize<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareInt2);
exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
ushort2 tile = exclusionTilesVec[i];
int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
......@@ -463,9 +463,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true;
}
......
......@@ -264,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
includeBlock2 = forceInclude = true;
#endif
if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y;
int y = (int) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y);
}
......@@ -278,7 +278,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1;
forceInclude = (forceIncludeFlags>>i) & 1;
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
int y = (int) sortedBlocks[block2Base+i].y;
// Check each atom in block Y for interactions.
......
......@@ -105,7 +105,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
*/
extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
#ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
......@@ -129,7 +129,7 @@ extern "C" __global__ void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
......@@ -324,12 +324,11 @@ extern "C" __global__ void computeNonbonded(
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
int pos = (int) (startTileIndex+warp*(long long)numTiles/totalWarps);
int end = (int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
......@@ -365,7 +364,7 @@ extern "C" __global__ void computeNonbonded(
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -31,6 +31,8 @@
#include "CudaTests.h"
#include "TestNonbondedForce.h"
#include <cuda.h>
#include <string>
void testParallelComputation(NonbondedForce::NonbondedMethod method) {
System system;
......@@ -152,10 +154,33 @@ void testDeterministicForces() {
}
}
bool canRunHugeTest() {
// Create a minimal context just to see which device is being used.
System system;
system.addParticle(1.0);
VerletIntegrator integrator(1.0);
Context context(system, integrator, platform);
int deviceIndex = stoi(platform.getPropertyValue(context, CudaPlatform::CudaDeviceIndex()));
// Find out how much memory the device has.
CUdevice device;
cuDeviceGet(&device, deviceIndex);
size_t memory;
cuDeviceTotalMem(&memory, device);
// Only run the huge test if the device has at least 4 GB of memory.
return (memory >= 4*(1<<30));
}
void runPlatformTests() {
testParallelComputation(NonbondedForce::NoCutoff);
testParallelComputation(NonbondedForce::Ewald);
testParallelComputation(NonbondedForce::PME);
testReordering();
testDeterministicForces();
if (canRunHugeTest())
testHugeSystem();
}
......@@ -328,8 +328,9 @@ private:
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
long long numTiles;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Portions copyright (c) 2009-2020 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -177,13 +177,13 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc
}
}
static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
static bool compareInt2(mm_int2 a, mm_int2 b) {
// This version is used on devices with SIMD width of 32 or less. It sorts tiles to improve cache efficiency.
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
static bool compareUshort2LargeSIMD(mm_ushort2 a, mm_ushort2 b) {
static bool compareInt2LargeSIMD(mm_int2 a, mm_int2 b) {
// This version is used on devices with SIMD width greater than 32. It puts diagonal tiles before off-diagonal
// ones to reduce thread divergence.
......@@ -223,15 +223,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
vector<mm_ushort2> exclusionTilesVec;
vector<mm_int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareUshort2 : compareUshort2LargeSIMD);
exclusionTiles.initialize<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTilesVec.push_back(mm_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareInt2 : compareInt2LargeSIMD);
exclusionTiles.initialize<mm_int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
mm_ushort2 tile = exclusionTilesVec[i];
mm_int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
......@@ -440,9 +440,9 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);;
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
if (useCutoff) {
// We are using a cutoff, and the kernels have already been created.
......@@ -450,15 +450,15 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
KernelSet& kernels = iter->second;
if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
kernels.forceKernel.setArg<cl_uint>(5, startTileIndex);
kernels.forceKernel.setArg<cl_uint>(6, numTiles);
kernels.forceKernel.setArg<cl_ulong>(6, numTiles);
}
if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
kernels.energyKernel.setArg<cl_uint>(5, startTileIndex);
kernels.energyKernel.setArg<cl_uint>(6, numTiles);
kernels.energyKernel.setArg<cl_ulong>(6, numTiles);
}
if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
kernels.forceEnergyKernel.setArg<cl_uint>(5, startTileIndex);
kernels.forceEnergyKernel.setArg<cl_uint>(6, numTiles);
kernels.forceEnergyKernel.setArg<cl_ulong>(6, numTiles);
}
kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
......@@ -711,7 +711,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer());
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, numTiles);
kernel.setArg<cl_ulong>(index++, numTiles);
if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer());
......
......@@ -166,7 +166,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
includeBlock2 = true;
#endif
if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y;
int y = (int) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y);
}
......@@ -180,7 +180,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
i++;
if (i < TILE_SIZE) {
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
int y = (int) sortedBlocks[block2Base+i].y;
// Check each atom in block Y for interactions.
......
......@@ -69,7 +69,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
*/
void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
__global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX,
real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
real4 posBuffer[TILE_SIZE];
......@@ -167,7 +167,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
__global const int* restrict rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
unsigned short buffer[BUFFER_SIZE];
int buffer[BUFFER_SIZE];
int atoms[BUFFER_SIZE];
int exclusionsForX[MAX_EXCLUSIONS];
int valuesInBuffer;
......@@ -179,7 +179,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
valuesInBuffer = 0;
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
unsigned short x = (unsigned short) sortedKey.y;
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];
......@@ -195,7 +195,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int j = i+1; j < NUM_BLOCKS; j++) {
real2 sortedKey2 = sortedBlocks[j];
unsigned short y = (unsigned short) sortedKey2.y;
int y = (int) sortedKey2.y;
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
......
......@@ -23,7 +23,7 @@ __kernel void computeNonbonded(
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
......@@ -43,7 +43,7 @@ __kernel void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = 0;
......@@ -205,12 +205,11 @@ __kernel void computeNonbonded(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long)numTileIndices/totalWarps : warp*(long)numTiles/totalWarps);
int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long)numTileIndices/totalWarps : (warp+1)*(long)numTiles/totalWarps);
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
int pos = (int) (startTileIndex+warp*(long)numTiles/totalWarps);
int end = (int) (startTileIndex+(warp+1)*(long)numTiles/totalWarps);
int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
......@@ -247,7 +246,7 @@ __kernel void computeNonbonded(
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -20,7 +20,7 @@ __kernel void computeNonbonded(
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
......@@ -36,7 +36,7 @@ __kernel void computeNonbonded(
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
......@@ -256,7 +256,7 @@ __kernel void computeNonbonded(
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
int2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -31,6 +31,8 @@
#include "OpenCLTests.h"
#include "TestNonbondedForce.h"
#include <cl.hpp>
#include <string>
void testParallelComputation(NonbondedForce::NonbondedMethod method) {
System system;
......@@ -122,9 +124,34 @@ void testReordering() {
}
}
bool canRunHugeTest() {
// Create a minimal context just to see which platform and device are being used.
System system;
system.addParticle(1.0);
VerletIntegrator integrator(1.0);
Context context(system, integrator, platform);
int platformIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLPlatformIndex()));
int deviceIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLDeviceIndex()));
// Find out how much memory the device has.
vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
vector<cl::Device> devices;
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
long long memory = devices[deviceIndex].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
// Only run the huge test if the device has at least 4 GB of memory.
return (memory >= 4*(1<<30));
}
void runPlatformTests() {
testParallelComputation(NonbondedForce::NoCutoff);
testParallelComputation(NonbondedForce::Ewald);
testParallelComputation(NonbondedForce::PME);
testReordering();
if (canRunHugeTest())
testHugeSystem();
}
......@@ -1239,11 +1239,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
// Figure out the covalent flag values to use for each atom pair.
vector<ushort2> exclusionTiles;
vector<int2> exclusionTiles;
nb.getExclusionTiles().download(exclusionTiles);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTiles.size(); i++) {
ushort2 tile = exclusionTiles[i];
int2 tile = exclusionTiles[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
covalentFlags.initialize<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment