Fixed range overflow with very large numbers of atoms (#2806)

* Fixed range overflow with very large numbers of atoms * More fixes to overflow with large numbers of atoms * Fix test failures

Fixed range overflow with very large numbers of atoms (#2806)
* Fixed range overflow with very large numbers of atoms * More fixes to overflow with large numbers of atoms * Fix test failures
cdc0789a · peastman · GitHub · b4543a46 · cdc0789a · cdc0789a
Unverified Commit cdc0789a authored Aug 20, 2020 by peastman Committed by GitHub Aug 20, 2020
20 changed files
--- a/platforms/common/src/kernels/customGBEnergyN2.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2.cc
@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
 #endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
-        GLOBAL const ushort2* exclusionTiles, int needEnergy,
+        GLOBAL const int2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -41,7 +41,7 @@ KERNEL void computeN2Energy(
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
@@ -236,7 +236,7 @@ KERNEL void computeN2Energy(
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
@@ -17,7 +17,7 @@ KERNEL void computeN2Energy(
 #endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
-        GLOBAL const ushort2* exclusionTiles, int needEnergy,
+        GLOBAL const int2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -37,7 +37,7 @@ KERNEL void computeN2Energy(
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

@@ -248,7 +248,7 @@ KERNEL void computeN2Energy(

        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                int2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/common/src/kernels/customGBValueN2.cc
+++ b/platforms/common/src/kernels/customGBValueN2.cc
@@ -2,7 +2,7 @@
 * Compute a value based on pair interactions.
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
-        GLOBAL const ushort2* exclusionTiles,
+        GLOBAL const int2* exclusionTiles,
 #ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
 #else
@@ -29,7 +29,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real value = 0;
@@ -205,7 +205,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/common/src/kernels/customGBValueN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBValueN2_cpu.cc
@@ -2,7 +2,7 @@
 * Compute a value based on pair interactions.
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
-        GLOBAL const ushort2* exclusionTiles,
+        GLOBAL const int2* exclusionTiles,
 #ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
 #else
@@ -25,7 +25,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

@@ -213,7 +213,7 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                int2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/common/src/kernels/gbsaObc.cc
+++ b/platforms/common/src/kernels/gbsaObc.cc
@@ -24,7 +24,7 @@ KERNEL void computeBornSum(
 #else
        unsigned int numTiles,
 #endif
-        GLOBAL const ushort2* RESTRICT exclusionTiles) {
+        GLOBAL const int2* RESTRICT exclusionTiles) {
    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
@@ -36,7 +36,7 @@ KERNEL void computeBornSum(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real bornSum = 0;
@@ -209,7 +209,7 @@ KERNEL void computeBornSum(
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
@@ -393,7 +393,7 @@ KERNEL void computeGBSAForce1(
 #else
        unsigned int numTiles,
 #endif
-        GLOBAL const ushort2* RESTRICT exclusionTiles) {
+        GLOBAL const int2* RESTRICT exclusionTiles) {
    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
@@ -406,7 +406,7 @@ KERNEL void computeGBSAForce1(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real4 force = make_real4(0);
@@ -604,7 +604,7 @@ KERNEL void computeGBSAForce1(
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/common/src/kernels/gbsaObc_cpu.cc
+++ b/platforms/common/src/kernels/gbsaObc_cpu.cc
@@ -22,7 +22,7 @@ KERNEL void computeBornSum(
 #else
        unsigned int numTiles,
 #endif
-        GLOBAL const ushort2* exclusionTiles) {
+        GLOBAL const int2* exclusionTiles) {
    LOCAL AtomData1 localData[TILE_SIZE];

    // First loop: process tiles that contain exclusions.
@@ -30,7 +30,7 @@ KERNEL void computeBornSum(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

@@ -213,7 +213,7 @@ KERNEL void computeBornSum(

        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                int2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
@@ -416,7 +416,7 @@ KERNEL void computeGBSAForce1(
 #else
        unsigned int numTiles,
 #endif
-        GLOBAL const ushort2* exclusionTiles) {
+        GLOBAL const int2* exclusionTiles) {
    mixed energy = 0;
    LOCAL AtomData2 localData[TILE_SIZE];

@@ -425,7 +425,7 @@ KERNEL void computeGBSAForce1(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

@@ -637,7 +637,7 @@ KERNEL void computeGBSAForce1(

        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                int2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/cpu/tests/TestCpuNonbondedForce.cpp
+++ b/platforms/cpu/tests/TestCpuNonbondedForce.cpp
@@ -33,4 +33,5 @@
 #include "TestNonbondedForce.h"

 void runPlatformTests() {
+    testHugeSystem();
 }
--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -348,7 +348,8 @@ private:
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    int startTileIndex, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    long long numTiles;
    std::string kernelSource;
 };


--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -167,7 +167,7 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
    }
 }

-static bool compareUshort2(ushort2 a, ushort2 b) {
+static bool compareInt2(int2 a, int2 b) {
    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
 }

@@ -199,15 +199,15 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
-    vector<ushort2> exclusionTilesVec;
+    vector<int2> exclusionTilesVec;
    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
-        exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
-    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
-    exclusionTiles.initialize<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
+        exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareInt2);
+    exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
    exclusionTiles.upload(exclusionTilesVec);
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
-        ushort2 tile = exclusionTilesVec[i];
+        int2 tile = exclusionTilesVec[i];
        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
    }
    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
@@ -463,9 +463,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
    int numAtomBlocks = context.getNumAtomBlocks();
    startBlockIndex = (int) (startFraction*numAtomBlocks);
    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
-    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
    startTileIndex = (int) (startFraction*totalTiles);
-    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
+    numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
    forceRebuildNeighborList = true;
 }


--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -264,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                    includeBlock2 = forceInclude = true;
 #endif
                if (includeBlock2) {
-                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
+                    int y = (int) sortedBlocks[block2].y;
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
@@ -278,7 +278,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
                forceInclude = (forceIncludeFlags>>i) & 1;
-                unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
+                int y = (int) sortedBlocks[block2Base+i].y;

                // Check each atom in block Y for interactions.


--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -105,7 +105,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
 */
 extern "C" __global__ void computeNonbonded(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
-        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
+        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
 #ifdef USE_CUTOFF
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
@@ -129,7 +129,7 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
@@ -324,12 +324,11 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
-    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
+    int pos = (int) (warp*(long long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
 #else
-    const unsigned int numTiles = numTileIndices;
-    int pos = (int) (startTileIndex+warp*(long long)numTiles/totalWarps);
-    int end = (int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
+    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
+    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
@@ -365,7 +364,7 @@ extern "C" __global__ void computeNonbonded(

        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -31,6 +31,8 @@

 #include "CudaTests.h"
 #include "TestNonbondedForce.h"
+#include <cuda.h>
+#include <string>

 void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
@@ -152,10 +154,33 @@ void testDeterministicForces() {
    }
 }

+bool canRunHugeTest() {
+    // Create a minimal context just to see which device is being used.
+
+    System system;
+    system.addParticle(1.0);
+    VerletIntegrator integrator(1.0);
+    Context context(system, integrator, platform);
+    int deviceIndex = stoi(platform.getPropertyValue(context, CudaPlatform::CudaDeviceIndex()));
+
+    // Find out how much memory the device has.
+
+    CUdevice device;
+    cuDeviceGet(&device, deviceIndex);
+    size_t memory;
+    cuDeviceTotalMem(&memory, device);
+
+    // Only run the huge test if the device has at least 4 GB of memory.
+
+    return (memory >= 4*(1<<30));
+}
+
 void runPlatformTests() {
    testParallelComputation(NonbondedForce::NoCutoff);
    testParallelComputation(NonbondedForce::Ewald);
    testParallelComputation(NonbondedForce::PME);
    testReordering();
    testDeterministicForces();
+    if (canRunHugeTest())
+        testHugeSystem();
 }
--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -328,8 +328,9 @@ private:
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
-    int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
+    int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
    int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
+    long long numTiles;
 };

 /**

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -177,13 +177,13 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc
    }
 }

-static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
+static bool compareInt2(mm_int2 a, mm_int2 b) {
    // This version is used on devices with SIMD width of 32 or less.  It sorts tiles to improve cache efficiency.

    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
 }

-static bool compareUshort2LargeSIMD(mm_ushort2 a, mm_ushort2 b) {
+static bool compareInt2LargeSIMD(mm_int2 a, mm_int2 b) {
    // This version is used on devices with SIMD width greater than 32.  It puts diagonal tiles before off-diagonal
    // ones to reduce thread divergence.
    
@@ -223,15 +223,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
-    vector<mm_ushort2> exclusionTilesVec;
+    vector<mm_int2> exclusionTilesVec;
    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
-        exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
-    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareUshort2 : compareUshort2LargeSIMD);
-    exclusionTiles.initialize<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
+        exclusionTilesVec.push_back(mm_int2(iter->first, iter->second));
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareInt2 : compareInt2LargeSIMD);
+    exclusionTiles.initialize<mm_int2>(context, exclusionTilesVec.size(), "exclusionTiles");
    exclusionTiles.upload(exclusionTilesVec);
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
-        mm_ushort2 tile = exclusionTilesVec[i];
+        mm_int2 tile = exclusionTilesVec[i];
        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
    }
    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
@@ -440,9 +440,9 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
    int numAtomBlocks = context.getNumAtomBlocks();
    startBlockIndex = (int) (startFraction*numAtomBlocks);
    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
-    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
    startTileIndex = (int) (startFraction*totalTiles);;
-    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
+    numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
    if (useCutoff) {
        // We are using a cutoff, and the kernels have already been created.

@@ -450,15 +450,15 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
            KernelSet& kernels = iter->second;
            if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
                kernels.forceKernel.setArg<cl_uint>(5, startTileIndex);
-                kernels.forceKernel.setArg<cl_uint>(6, numTiles);
+                kernels.forceKernel.setArg<cl_ulong>(6, numTiles);
            }
            if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
                kernels.energyKernel.setArg<cl_uint>(5, startTileIndex);
-                kernels.energyKernel.setArg<cl_uint>(6, numTiles);
+                kernels.energyKernel.setArg<cl_ulong>(6, numTiles);
            }
            if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
                kernels.forceEnergyKernel.setArg<cl_uint>(5, startTileIndex);
-                kernels.forceEnergyKernel.setArg<cl_uint>(6, numTiles);
+                kernels.forceEnergyKernel.setArg<cl_ulong>(6, numTiles);
            }
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
@@ -711,7 +711,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer());
    kernel.setArg<cl_uint>(index++, startTileIndex);
-    kernel.setArg<cl_uint>(index++, numTiles);
+    kernel.setArg<cl_ulong>(index++, numTiles);
    if (useCutoff) {
        kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer());

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
@@ -166,7 +166,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
                    includeBlock2 = true;
 #endif
                if (includeBlock2) {
-                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
+                    int y = (int) sortedBlocks[block2].y;
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
@@ -180,7 +180,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
                while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
                    i++;
                if (i < TILE_SIZE) {
-                    unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
+                    int y = (int) sortedBlocks[block2Base+i].y;

                    // Check each atom in block Y for interactions.


--- a/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
@@ -69,7 +69,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
 * This is called by findBlocksWithInteractions().  It compacts the list of blocks and writes them
 * to global memory.
 */
-void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
+void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
            __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX,
            real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    real4 posBuffer[TILE_SIZE];
@@ -167,7 +167,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
        __global const int* restrict rebuildNeighborList) {
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
-    unsigned short buffer[BUFFER_SIZE];
+    int buffer[BUFFER_SIZE];
    int atoms[BUFFER_SIZE];
    int exclusionsForX[MAX_EXCLUSIONS];
    int valuesInBuffer;
@@ -179,7 +179,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
        valuesInBuffer = 0;
        numAtoms = 0;
        real2 sortedKey = sortedBlocks[i];
-        unsigned short x = (unsigned short) sortedKey.y;
+        int x = (int) sortedKey.y;
        real4 blockCenterX = sortedBlockCenter[i];
        real4 blockSizeX = sortedBlockBoundingBox[i];

@@ -195,7 +195,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
        
        for (int j = i+1; j < NUM_BLOCKS; j++) {
            real2 sortedKey2 = sortedBlocks[j];
-            unsigned short y = (unsigned short) sortedKey2.y;
+            int y = (int) sortedKey2.y;
            bool hasExclusions = false;
            for (int k = 0; k < numExclusions; k++)
                hasExclusions |= (exclusionsForX[k] == y);

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -23,7 +23,7 @@ __kernel void computeNonbonded(
        __global real4* restrict forceBuffers,
 #endif
        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
+        __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
 #ifdef USE_CUTOFF
        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -43,7 +43,7 @@ __kernel void computeNonbonded(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real4 force = 0;
@@ -205,12 +205,11 @@ __kernel void computeNonbonded(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long)numTileIndices/totalWarps : warp*(long)numTiles/totalWarps);
-    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long)numTileIndices/totalWarps : (warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (warp*(long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(long)numTiles/totalWarps);
 #else
-    const unsigned int numTiles = numTileIndices;
-    int pos = (int) (startTileIndex+warp*(long)numTiles/totalWarps);
-    int end = (int) (startTileIndex+(warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
+    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
@@ -247,7 +246,7 @@ __kernel void computeNonbonded(
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
+                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -20,7 +20,7 @@ __kernel void computeNonbonded(
        __global real4* restrict forceBuffers,
 #endif
        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
+        __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
 #ifdef USE_CUTOFF
        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -36,7 +36,7 @@ __kernel void computeNonbonded(
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
+        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

@@ -256,7 +256,7 @@ __kernel void computeNonbonded(

        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                int2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -31,6 +31,8 @@

 #include "OpenCLTests.h"
 #include "TestNonbondedForce.h"
+#include <cl.hpp>
+#include <string>

 void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
@@ -122,9 +124,34 @@ void testReordering() {
    }
 }

+bool canRunHugeTest() {
+    // Create a minimal context just to see which platform and device are being used.
+    
+    System system;
+    system.addParticle(1.0);
+    VerletIntegrator integrator(1.0);
+    Context context(system, integrator, platform);
+    int platformIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLPlatformIndex()));
+    int deviceIndex = stoi(platform.getPropertyValue(context, OpenCLPlatform::OpenCLDeviceIndex()));
+
+    // Find out how much memory the device has.
+
+    vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+    vector<cl::Device> devices;
+    platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+    long long memory = devices[deviceIndex].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+
+    // Only run the huge test if the device has at least 4 GB of memory.
+
+    return (memory >= 4*(1<<30));
+}
+
 void runPlatformTests() {
    testParallelComputation(NonbondedForce::NoCutoff);
    testParallelComputation(NonbondedForce::Ewald);
    testParallelComputation(NonbondedForce::PME);
    testReordering();
+    if (canRunHugeTest())
+        testHugeSystem();
 }
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1239,11 +1239,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
    
    // Figure out the covalent flag values to use for each atom pair.

-    vector<ushort2> exclusionTiles;
+    vector<int2> exclusionTiles;
    nb.getExclusionTiles().download(exclusionTiles);
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTiles.size(); i++) {
-        ushort2 tile = exclusionTiles[i];
+        int2 tile = exclusionTiles[i];
        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
    }
    covalentFlags.initialize<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");